From ace9429bb58fd418f0c81d4c2835699bddf6bde6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 11 Apr 2024 10:27:49 +0200 Subject: Adding upstream version 6.6.15. Signed-off-by: Daniel Baumann --- fs/btrfs/Kconfig | 111 + fs/btrfs/Makefile | 47 + fs/btrfs/accessors.c | 174 + fs/btrfs/accessors.h | 1072 +++ fs/btrfs/acl.c | 130 + fs/btrfs/acl.h | 27 + fs/btrfs/async-thread.c | 375 ++ fs/btrfs/async-thread.h | 48 + fs/btrfs/backref.c | 3648 ++++++++++ fs/btrfs/backref.h | 556 ++ fs/btrfs/bio.c | 872 +++ fs/btrfs/bio.h | 111 + fs/btrfs/block-group.c | 4545 +++++++++++++ fs/btrfs/block-group.h | 370 ++ fs/btrfs/block-rsv.c | 567 ++ fs/btrfs/block-rsv.h | 104 + fs/btrfs/btrfs_inode.h | 534 ++ fs/btrfs/check-integrity.c | 2871 ++++++++ fs/btrfs/check-integrity.h | 20 + fs/btrfs/compression.c | 1464 +++++ fs/btrfs/compression.h | 176 + fs/btrfs/ctree.c | 5224 +++++++++++++++ fs/btrfs/ctree.h | 730 ++ fs/btrfs/defrag.c | 1379 ++++ fs/btrfs/defrag.h | 22 + fs/btrfs/delalloc-space.c | 496 ++ fs/btrfs/delalloc-space.h | 27 + fs/btrfs/delayed-inode.c | 2200 +++++++ fs/btrfs/delayed-inode.h | 177 + fs/btrfs/delayed-ref.c | 1160 ++++ fs/btrfs/delayed-ref.h | 428 ++ fs/btrfs/dev-replace.c | 1291 ++++ fs/btrfs/dev-replace.h | 37 + fs/btrfs/dir-item.c | 445 ++ fs/btrfs/dir-item.h | 42 + fs/btrfs/discard.c | 777 +++ fs/btrfs/discard.h | 39 + fs/btrfs/disk-io.c | 5005 ++++++++++++++ fs/btrfs/disk-io.h | 131 + fs/btrfs/export.c | 294 + fs/btrfs/export.h | 25 + fs/btrfs/extent-io-tree.c | 1779 +++++ fs/btrfs/extent-io-tree.h | 198 + fs/btrfs/extent-tree.c | 6177 +++++++++++++++++ fs/btrfs/extent-tree.h | 151 + fs/btrfs/extent_io.c | 4676 +++++++++++++ fs/btrfs/extent_io.h | 313 + fs/btrfs/extent_map.c | 1053 +++ fs/btrfs/extent_map.h | 114 + fs/btrfs/file-item.c | 1354 ++++ fs/btrfs/file-item.h | 73 + fs/btrfs/file.c | 3864 +++++++++++ fs/btrfs/file.h | 33 + fs/btrfs/free-space-cache.c | 4334 ++++++++++++ fs/btrfs/free-space-cache.h | 162 + fs/btrfs/free-space-tree.c | 1667 +++++ fs/btrfs/free-space-tree.h | 54 + fs/btrfs/fs.c | 98 + fs/btrfs/fs.h | 999 +++ fs/btrfs/inode-item.c | 751 +++ fs/btrfs/inode-item.h | 110 + fs/btrfs/inode.c | 10964 +++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 4741 +++++++++++++ fs/btrfs/ioctl.h | 17 + fs/btrfs/locking.c | 393 ++ fs/btrfs/locking.h | 224 + fs/btrfs/lru_cache.c | 166 + fs/btrfs/lru_cache.h | 75 + fs/btrfs/lzo.c | 493 ++ fs/btrfs/messages.c | 313 + fs/btrfs/messages.h | 222 + fs/btrfs/misc.h | 164 + fs/btrfs/ordered-data.c | 1264 ++++ fs/btrfs/ordered-data.h | 214 + fs/btrfs/orphan.c | 59 + fs/btrfs/orphan.h | 11 + fs/btrfs/print-tree.c | 414 ++ fs/btrfs/print-tree.h | 16 + fs/btrfs/props.c | 473 ++ fs/btrfs/props.h | 26 + fs/btrfs/qgroup.c | 4443 +++++++++++++ fs/btrfs/qgroup.h | 452 ++ fs/btrfs/raid56.c | 2782 ++++++++ fs/btrfs/raid56.h | 201 + fs/btrfs/rcu-string.h | 52 + fs/btrfs/ref-verify.c | 1028 +++ fs/btrfs/ref-verify.h | 49 + fs/btrfs/reflink.c | 935 +++ fs/btrfs/reflink.h | 12 + fs/btrfs/relocation.c | 4573 +++++++++++++ fs/btrfs/relocation.h | 24 + fs/btrfs/root-tree.c | 549 ++ fs/btrfs/root-tree.h | 34 + fs/btrfs/scrub.c | 3062 +++++++++ fs/btrfs/scrub.h | 16 + fs/btrfs/send.c | 8413 ++++++++++++++++++++++++ fs/btrfs/send.h | 185 + fs/btrfs/space-info.c | 1853 ++++++ fs/btrfs/space-info.h | 240 + fs/btrfs/subpage.c | 754 +++ fs/btrfs/subpage.h | 158 + fs/btrfs/super.c | 2557 +++++++ fs/btrfs/super.h | 29 + fs/btrfs/sysfs.c | 2397 +++++++ fs/btrfs/sysfs.h | 42 + fs/btrfs/tests/btrfs-tests.c | 304 + fs/btrfs/tests/btrfs-tests.h | 57 + 
fs/btrfs/tests/extent-buffer-tests.c | 223 + fs/btrfs/tests/extent-io-tests.c | 812 +++ fs/btrfs/tests/extent-map-tests.c | 1047 +++ fs/btrfs/tests/free-space-tests.c | 1063 +++ fs/btrfs/tests/free-space-tree-tests.c | 589 ++ fs/btrfs/tests/inode-tests.c | 1108 ++++ fs/btrfs/tests/qgroup-tests.c | 559 ++ fs/btrfs/transaction.c | 2682 ++++++++ fs/btrfs/transaction.h | 275 + fs/btrfs/tree-checker.c | 2037 ++++++ fs/btrfs/tree-checker.h | 72 + fs/btrfs/tree-log.c | 7534 +++++++++++++++++++++ fs/btrfs/tree-log.h | 110 + fs/btrfs/tree-mod-log.c | 1114 ++++ fs/btrfs/tree-mod-log.h | 53 + fs/btrfs/ulist.c | 284 + fs/btrfs/ulist.h | 74 + fs/btrfs/uuid-tree.c | 393 ++ fs/btrfs/uuid-tree.h | 12 + fs/btrfs/verity.c | 811 +++ fs/btrfs/verity.h | 28 + fs/btrfs/volumes.c | 8104 +++++++++++++++++++++++ fs/btrfs/volumes.h | 752 +++ fs/btrfs/xattr.c | 492 ++ fs/btrfs/xattr.h | 25 + fs/btrfs/zlib.c | 454 ++ fs/btrfs/zoned.c | 2533 +++++++ fs/btrfs/zoned.h | 403 ++ fs/btrfs/zstd.c | 702 ++ 136 files changed, 155506 insertions(+) create mode 100644 fs/btrfs/Kconfig create mode 100644 fs/btrfs/Makefile create mode 100644 fs/btrfs/accessors.c create mode 100644 fs/btrfs/accessors.h create mode 100644 fs/btrfs/acl.c create mode 100644 fs/btrfs/acl.h create mode 100644 fs/btrfs/async-thread.c create mode 100644 fs/btrfs/async-thread.h create mode 100644 fs/btrfs/backref.c create mode 100644 fs/btrfs/backref.h create mode 100644 fs/btrfs/bio.c create mode 100644 fs/btrfs/bio.h create mode 100644 fs/btrfs/block-group.c create mode 100644 fs/btrfs/block-group.h create mode 100644 fs/btrfs/block-rsv.c create mode 100644 fs/btrfs/block-rsv.h create mode 100644 fs/btrfs/btrfs_inode.h create mode 100644 fs/btrfs/check-integrity.c create mode 100644 fs/btrfs/check-integrity.h create mode 100644 fs/btrfs/compression.c create mode 100644 fs/btrfs/compression.h create mode 100644 fs/btrfs/ctree.c create mode 100644 fs/btrfs/ctree.h create mode 100644 fs/btrfs/defrag.c create mode 100644 fs/btrfs/defrag.h create mode 100644 fs/btrfs/delalloc-space.c create mode 100644 fs/btrfs/delalloc-space.h create mode 100644 fs/btrfs/delayed-inode.c create mode 100644 fs/btrfs/delayed-inode.h create mode 100644 fs/btrfs/delayed-ref.c create mode 100644 fs/btrfs/delayed-ref.h create mode 100644 fs/btrfs/dev-replace.c create mode 100644 fs/btrfs/dev-replace.h create mode 100644 fs/btrfs/dir-item.c create mode 100644 fs/btrfs/dir-item.h create mode 100644 fs/btrfs/discard.c create mode 100644 fs/btrfs/discard.h create mode 100644 fs/btrfs/disk-io.c create mode 100644 fs/btrfs/disk-io.h create mode 100644 fs/btrfs/export.c create mode 100644 fs/btrfs/export.h create mode 100644 fs/btrfs/extent-io-tree.c create mode 100644 fs/btrfs/extent-io-tree.h create mode 100644 fs/btrfs/extent-tree.c create mode 100644 fs/btrfs/extent-tree.h create mode 100644 fs/btrfs/extent_io.c create mode 100644 fs/btrfs/extent_io.h create mode 100644 fs/btrfs/extent_map.c create mode 100644 fs/btrfs/extent_map.h create mode 100644 fs/btrfs/file-item.c create mode 100644 fs/btrfs/file-item.h create mode 100644 fs/btrfs/file.c create mode 100644 fs/btrfs/file.h create mode 100644 fs/btrfs/free-space-cache.c create mode 100644 fs/btrfs/free-space-cache.h create mode 100644 fs/btrfs/free-space-tree.c create mode 100644 fs/btrfs/free-space-tree.h create mode 100644 fs/btrfs/fs.c create mode 100644 fs/btrfs/fs.h create mode 100644 fs/btrfs/inode-item.c create mode 100644 fs/btrfs/inode-item.h create mode 100644 fs/btrfs/inode.c create mode 100644 fs/btrfs/ioctl.c create 
mode 100644 fs/btrfs/ioctl.h create mode 100644 fs/btrfs/locking.c create mode 100644 fs/btrfs/locking.h create mode 100644 fs/btrfs/lru_cache.c create mode 100644 fs/btrfs/lru_cache.h create mode 100644 fs/btrfs/lzo.c create mode 100644 fs/btrfs/messages.c create mode 100644 fs/btrfs/messages.h create mode 100644 fs/btrfs/misc.h create mode 100644 fs/btrfs/ordered-data.c create mode 100644 fs/btrfs/ordered-data.h create mode 100644 fs/btrfs/orphan.c create mode 100644 fs/btrfs/orphan.h create mode 100644 fs/btrfs/print-tree.c create mode 100644 fs/btrfs/print-tree.h create mode 100644 fs/btrfs/props.c create mode 100644 fs/btrfs/props.h create mode 100644 fs/btrfs/qgroup.c create mode 100644 fs/btrfs/qgroup.h create mode 100644 fs/btrfs/raid56.c create mode 100644 fs/btrfs/raid56.h create mode 100644 fs/btrfs/rcu-string.h create mode 100644 fs/btrfs/ref-verify.c create mode 100644 fs/btrfs/ref-verify.h create mode 100644 fs/btrfs/reflink.c create mode 100644 fs/btrfs/reflink.h create mode 100644 fs/btrfs/relocation.c create mode 100644 fs/btrfs/relocation.h create mode 100644 fs/btrfs/root-tree.c create mode 100644 fs/btrfs/root-tree.h create mode 100644 fs/btrfs/scrub.c create mode 100644 fs/btrfs/scrub.h create mode 100644 fs/btrfs/send.c create mode 100644 fs/btrfs/send.h create mode 100644 fs/btrfs/space-info.c create mode 100644 fs/btrfs/space-info.h create mode 100644 fs/btrfs/subpage.c create mode 100644 fs/btrfs/subpage.h create mode 100644 fs/btrfs/super.c create mode 100644 fs/btrfs/super.h create mode 100644 fs/btrfs/sysfs.c create mode 100644 fs/btrfs/sysfs.h create mode 100644 fs/btrfs/tests/btrfs-tests.c create mode 100644 fs/btrfs/tests/btrfs-tests.h create mode 100644 fs/btrfs/tests/extent-buffer-tests.c create mode 100644 fs/btrfs/tests/extent-io-tests.c create mode 100644 fs/btrfs/tests/extent-map-tests.c create mode 100644 fs/btrfs/tests/free-space-tests.c create mode 100644 fs/btrfs/tests/free-space-tree-tests.c create mode 100644 fs/btrfs/tests/inode-tests.c create mode 100644 fs/btrfs/tests/qgroup-tests.c create mode 100644 fs/btrfs/transaction.c create mode 100644 fs/btrfs/transaction.h create mode 100644 fs/btrfs/tree-checker.c create mode 100644 fs/btrfs/tree-checker.h create mode 100644 fs/btrfs/tree-log.c create mode 100644 fs/btrfs/tree-log.h create mode 100644 fs/btrfs/tree-mod-log.c create mode 100644 fs/btrfs/tree-mod-log.h create mode 100644 fs/btrfs/ulist.c create mode 100644 fs/btrfs/ulist.h create mode 100644 fs/btrfs/uuid-tree.c create mode 100644 fs/btrfs/uuid-tree.h create mode 100644 fs/btrfs/verity.c create mode 100644 fs/btrfs/verity.h create mode 100644 fs/btrfs/volumes.c create mode 100644 fs/btrfs/volumes.h create mode 100644 fs/btrfs/xattr.c create mode 100644 fs/btrfs/xattr.h create mode 100644 fs/btrfs/zlib.c create mode 100644 fs/btrfs/zoned.c create mode 100644 fs/btrfs/zoned.h create mode 100644 fs/btrfs/zstd.c diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig new file mode 100644 index 0000000000..a25c9910d9 --- /dev/null +++ b/fs/btrfs/Kconfig @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: GPL-2.0 + +config BTRFS_FS + tristate "Btrfs filesystem support" + select BLK_CGROUP_PUNT_BIO + select CRYPTO + select CRYPTO_CRC32C + select LIBCRC32C + select CRYPTO_XXHASH + select CRYPTO_SHA256 + select CRYPTO_BLAKE2B + select ZLIB_INFLATE + select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + select FS_IOMAP + select RAID6_PQ + select XOR_BLOCKS + depends on 
PAGE_SIZE_LESS_THAN_256KB + + help + Btrfs is a general purpose copy-on-write filesystem with extents, + writable snapshotting, support for multiple devices and many more + features focused on fault tolerance, repair and easy administration. + + The filesystem disk format is no longer unstable, and it's not + expected to change unless there are strong reasons to do so. If there + is a format change, file systems with an unchanged format will + continue to be mountable and usable by newer kernels. + + For more information, please see the web pages at + https://btrfs.readthedocs.io + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + If you don't know what Access Control Lists are, say N. + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DEPRECATED)" + depends on BTRFS_FS + help + This feature has been deprecated and will be removed in 6.7. + + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N. + +config BTRFS_FS_RUN_SANITY_TESTS + bool "Btrfs will run sanity tests upon loading" + depends on BTRFS_FS + help + This will run some basic sanity tests on the free space cache + code to make sure it is acting as it should. These are mostly + regression tests and are only really interesting to btrfs + developers. + + If unsure, say N. + +config BTRFS_DEBUG + bool "Btrfs debugging support" + depends on BTRFS_FS + help + Enable run-time debugging support for the btrfs filesystem. This may + enable additional and expensive checks with negative impact on + performance, or export extra information via sysfs. + + If unsure, say N. + +config BTRFS_ASSERT + bool "Btrfs assert support" + depends on BTRFS_FS + help + Enable run-time assertion checking. This will result in panics if + any of the assertions trip. This is meant for btrfs developers only. + + If unsure, say N. + +config BTRFS_FS_REF_VERIFY + bool "Btrfs with the ref verify tool compiled in" + depends on BTRFS_FS + default n + help + Enable run-time extent reference verification instrumentation. This + is meant to be used by btrfs developers for tracking down extent + reference problems or verifying they didn't break something. + + If unsure, say N. 
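
The bool options above (BTRFS_DEBUG, BTRFS_ASSERT, BTRFS_FS_REF_VERIFY and friends) are compile-time switches: a symbol set to y becomes a CONFIG_* preprocessor define that the sources can test with #ifdef or IS_ENABLED(). The snippet below is a minimal, hedged sketch of that pattern only; EXAMPLE_ASSERT() is a made-up name and is not the real btrfs ASSERT() definition, which lives in the btrfs headers and differs in detail.

/*
 * Illustrative sketch of how a bool Kconfig symbol gates code at build time.
 * EXAMPLE_ASSERT() is hypothetical; shown only to clarify what enabling
 * CONFIG_BTRFS_ASSERT / CONFIG_BTRFS_DEBUG implies for the compiled code.
 */
#include <linux/bug.h>		/* BUG() */
#include <linux/compiler.h>	/* unlikely(), __maybe_unused */
#include <linux/kconfig.h>	/* IS_ENABLED() */
#include <linux/printk.h>	/* pr_err(), pr_debug() */

#ifdef CONFIG_BTRFS_ASSERT
/* Compiled in only when CONFIG_BTRFS_ASSERT=y; panics if the check trips. */
#define EXAMPLE_ASSERT(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			pr_err("assertion failed: %s, file %s, line %d\n", \
			       #expr, __FILE__, __LINE__);		\
			BUG();						\
		}							\
	} while (0)
#else
/* Expands to nothing on production builds. */
#define EXAMPLE_ASSERT(expr)	((void)0)
#endif

static __maybe_unused int example_check(int nr_items)
{
	EXAMPLE_ASSERT(nr_items >= 0);

	/* IS_ENABLED() lets ordinary C conditionals key off config symbols. */
	if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
		pr_debug("debug-only path, %d items\n", nr_items);

	return nr_items;
}
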
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 0000000000..90d5320975 --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Subset of W=1 warnings +subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += -Wmissing-format-attribute +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-include-dirs +condflags := \ + $(call cc-option, -Wunused-but-set-variable) \ + $(call cc-option, -Wunused-const-variable) \ + $(call cc-option, -Wpacked-not-aligned) \ + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) +subdir-ccflags-y += $(condflags) +# The following turn off the warnings enabled by -Wextra +subdir-ccflags-y += -Wno-missing-field-initializers +subdir-ccflags-y += -Wno-sign-compare +subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value + +obj-$(CONFIG_BTRFS_FS) := btrfs.o + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o disk-io.o \ + transaction.o inode.o file.o defrag.o \ + extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o + +btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o +btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o +btrfs-$(CONFIG_FS_VERITY) += verity.o + +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ + tests/extent-buffer-tests.o tests/btrfs-tests.o \ + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ + tests/free-space-tree-tests.o tests/extent-map-tests.o diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c new file mode 100644 index 0000000000..206cf1612c --- /dev/null +++ b/fs/btrfs/accessors.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include "messages.h" +#include "ctree.h" +#include "accessors.h" + +static bool check_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) +{ + const unsigned long member_offset = (unsigned long)ptr + off; + + if (unlikely(member_offset + size > eb->len)) { + btrfs_warn(eb->fs_info, + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), + (unsigned long)ptr, eb->start, member_offset, size); + return false; + } + + return true; +} + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +{ + token->eb = eb; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; +} + +/* + * Macro templates that define helpers to read/write extent buffer data of a + * given size, that are also used via ctree.h for access to item members by + * specialized helpers. 
+ * + * Generic helpers: + * - btrfs_set_8 (for 8/16/32/64) + * - btrfs_get_8 (for 8/16/32/64) + * + * Generic helpers with a token (cached address of the most recently accessed + * page): + * - btrfs_set_token_8 (for 8/16/32/64) + * - btrfs_get_token_8 (for 8/16/32/64) + * + * The set/get functions handle data spanning two pages transparently, in case + * metadata block size is larger than page. Every pointer to metadata items is + * an offset into the extent buffer page array, cast to a specific type. This + * gives us all the type checking. + * + * The extent buffer pages stored in the array pages do not form a contiguous + * physical range, but the API functions assume the linear offset to the range + * from 0 to metadata node size. + */ + +#define DEFINE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ + \ + ASSERT(token); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + return get_unaligned_le##bits(token->kaddr + oip); \ + } \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ + return get_unaligned_le##bits(token->kaddr + oip); \ + \ + memcpy(lebytes, token->kaddr + oip, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(lebytes + part, token->kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ +} \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ + \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ + return get_unaligned_le##bits(kaddr + oip); \ + \ + memcpy(lebytes, kaddr + oip, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(lebytes + part, kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ +} \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ + \ + ASSERT(token); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ + return; \ + } \ + token->kaddr = 
page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ + return; \ + } \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(token->kaddr + oip, lebytes, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(token->kaddr, lebytes + part, size - part); \ +} \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ + \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, kaddr + oip); \ + return; \ + } \ + \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(kaddr + oip, lebytes, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(kaddr, lebytes + part, size - part); \ +} + +DEFINE_BTRFS_SETGET_BITS(8) +DEFINE_BTRFS_SETGET_BITS(16) +DEFINE_BTRFS_SETGET_BITS(32) +DEFINE_BTRFS_SETGET_BITS(64) + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr); + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h new file mode 100644 index 0000000000..8cfc821410 --- /dev/null +++ b/fs/btrfs/accessors.h @@ -0,0 +1,1072 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACCESSORS_H +#define BTRFS_ACCESSORS_H + +#include + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); + +/* + * Some macros to generate set/get functions for the struct fields. 
This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple one + * for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +#define read_eb_member(eb, ptr, type, member, result) (\ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof_field(type, member))) + +#define write_eb_member(eb, ptr, type, member, result) (\ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof_field(type, member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ +} \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ +{ \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ + return get_unaligned_le##bits(&p->member); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + put_unaligned_le##bits(val, &p->member); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const type *s) \ +{ \ + return get_unaligned_le##bits(&s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + put_unaligned_le##bits(val, &s->member); \ +} + +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); +} +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, 
total_bytes)); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, 
struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) +{ + unsigned long offset = (unsigned long)c; + + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); 
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct 
btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + return 0; +} + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + + ptr = btrfs_node_key_ptr_offset(eb, nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); +} + +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 
btrfs_item_##member(const struct extent_buffer *eb, int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ +} + +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); + +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); +} + +static inline void btrfs_item_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* struct btrfs_root_ref */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); + +static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, + const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); +} + +static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); +} + +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, 
struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. + */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + const struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + const struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +#endif + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct 
btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags | flag); +} + +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags & ~flag); +} + +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) +{ + u64 flags = btrfs_header_flags(eb); + + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline int btrfs_is_leaf(const struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); 
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = 
le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static inline void btrfs_cpu_balance_args_to_disk( + struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, + nr_global_roots, 64); + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + + +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, + struct btrfs_qgroup_limit_item, flags, 64); 
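
For illustration only (not part of the patch): a simplified userspace sketch of what a BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) invocation conceptually provides, namely a typed getter/setter pair that converts a little-endian on-disk field to and from CPU byte order for a structure copy held in ordinary memory (the non-stack BTRFS_SETGET_FUNCS variants instead read and write the field through an extent buffer). The DEMO_* macro and struct are hypothetical stand-ins for a 64-bit field and do not reproduce the kernel macro, which also handles 8/16/32-bit widths.

#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct demo_qgroup_limit_item {		/* stand-in for an on-disk item */
	uint64_t max_rfer;		/* stored little-endian on disk */
};

/* Generate demo_<name>() and demo_set_<name>() for a 64-bit member. */
#define DEMO_SETGET_STACK_FUNCS(name, type, member)			\
static inline uint64_t demo_##name(const type *s)			\
{									\
	return le64toh(s->member);					\
}									\
static inline void demo_set_##name(type *s, uint64_t val)		\
{									\
	s->member = htole64(val);					\
}

DEMO_SETGET_STACK_FUNCS(qgroup_limit_max_rfer,
			struct demo_qgroup_limit_item, max_rfer)

int main(void)
{
	struct demo_qgroup_limit_item item;

	/* Setter stores little-endian; getter converts back to CPU order. */
	demo_set_qgroup_limit_max_rfer(&item, 1ULL << 30);
	printf("max_rfer = %llu\n",
	       (unsigned long long)demo_qgroup_limit_max_rfer(&item));
	return 0;
}
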
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, + struct btrfs_qgroup_limit_item, max_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, + struct btrfs_qgroup_limit_item, max_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, + struct btrfs_qgroup_limit_item, rsv_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, + struct btrfs_qgroup_limit_item, rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +/* Cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) + +#endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 0000000000..7427449a04 --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Red Hat. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" +#include "acl.h" + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + if (rcu) + return ERR_PTR(-ECHILD); + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = btrfs_getxattr(inode, name, NULL, 0); + if (size > 0) { + value = kzalloc(size, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = btrfs_getxattr(inode, name, value, size); + } + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(size); + kfree(value); + + return acl; +} + +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EINVAL : 0; + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + unsigned int nofs_flag; + + size = posix_acl_xattr_size(acl->a_count); + /* + * We're holding a transaction handle, so use a NOFS memory + * allocation context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); + value = kmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out; + } + + if (trans) + ret = btrfs_setxattr(trans, inode, name, value, size, 0); + else + ret = btrfs_setxattr_trans(inode, name, value, size, 0); + +out: + kfree(value); + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int ret; + struct inode *inode = d_inode(dentry); + umode_t old_mode = inode->i_mode; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(idmap, inode, + &inode->i_mode, &acl); + if (ret) + return ret; + } + ret = __btrfs_set_acl(NULL, inode, acl, type); + if (ret) + inode->i_mode = old_mode; + return ret; +} diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h new file mode 100644 index 0000000000..a270e71ec0 --- /dev/null +++ b/fs/btrfs/acl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACL_H +#define BTRFS_ACL_H + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); + +#else + +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EOPNOTSUPP; +} + +#endif + +#endif diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 0000000000..ce083e99ef --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,375 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include "async-thread.h" +#include "ctree.h" + +enum { + WORK_DONE_BIT, + WORK_ORDER_DONE_BIT, +}; + +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + +struct btrfs_workqueue { + struct workqueue_struct *normal_wq; + + /* File system this workqueue services */ + struct btrfs_fs_info *fs_info; + + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + + /* Up limit of concurrency workers */ + int limit_active; + + /* Current number of concurrency workers */ + int current_active; + + /* Threshold to change current_active */ + int thresh; + unsigned int count; + spinlock_t thres_lock; +}; + +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) +{ + return wq->fs_info; +} + +struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) +{ + return work->wq->fs_info; +} + +bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) +{ + /* + * We could compare wq->pending with num_online_cpus() + * to support "thresh == NO_THRESHOLD" case, but it requires + * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's + * postpone it until someone needs the support of that case. + */ + if (wq->thresh == NO_THRESHOLD) + return false; + + return atomic_read(&wq->pending) > wq->thresh * 2; +} + +static void btrfs_init_workqueue(struct btrfs_workqueue *wq, + struct btrfs_fs_info *fs_info) +{ + wq->fs_info = fs_info; + atomic_set(&wq->pending, 0); + INIT_LIST_HEAD(&wq->ordered_list); + spin_lock_init(&wq->list_lock); + spin_lock_init(&wq->thres_lock); +} + +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, + int limit_active, int thresh) +{ + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); + + if (!ret) + return NULL; + + btrfs_init_workqueue(ret, fs_info); + + ret->limit_active = limit_active; + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_active = limit_active; + ret->thresh = NO_THRESHOLD; + } else { + /* + * For threshold-able wq, let its concurrency grow on demand. + * Use minimal max_active at alloc time to reduce resource + * usage. + */ + ret->current_active = 1; + ret->thresh = thresh; + } + + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, + name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + + trace_btrfs_workqueue_alloc(ret, name); + return ret; +} + +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags) +{ + struct btrfs_workqueue *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + btrfs_init_workqueue(ret, fs_info); + + /* Ordered workqueues don't allow @max_active adjustments. */ + ret->limit_active = 1; + ret->current_active = 1; + ret->thresh = NO_THRESHOLD; + + ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + + trace_btrfs_workqueue_alloc(ret, name); + return ret; +} + +/* + * Hook for threshold which will be called in btrfs_queue_work. 
+ * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook + */ +static inline void thresh_queue_hook(struct btrfs_workqueue *wq) +{ + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); +} + +/* + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. + */ +static inline void thresh_exec_hook(struct btrfs_workqueue *wq) +{ + int new_current_active; + long pending; + int need_change = 0; + + if (wq->thresh == NO_THRESHOLD) + return; + + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_current_active = wq->current_active; + + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. + */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_current_active++; + if (pending < wq->thresh / 2) + new_current_active--; + new_current_active = clamp_val(new_current_active, 1, wq->limit_active); + if (new_current_active != wq->current_active) { + need_change = 1; + wq->current_active = new_current_active; + } +out: + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_active); + } +} + +static void run_ordered_work(struct btrfs_workqueue *wq, + struct btrfs_work *self) +{ + struct list_head *list = &wq->ordered_list; + struct btrfs_work *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; + bool free_self = false; + + while (1) { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) + break; + work = list_entry(list->next, struct btrfs_work, + ordered_list); + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); + + /* + * we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + trace_btrfs_ordered_sched(work); + spin_unlock_irqrestore(lock, flags); + work->ordered_func(work); + + /* now take the lock again and drop our item from the list */ + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); + + if (work == self) { + /* + * This is the work item that the worker is currently + * executing. + * + * The kernel workqueue code guarantees non-reentrancy + * of work items. I.e., if a work item with the same + * address and work function is queued twice, the second + * execution is blocked until the first one finishes. A + * work item may be freed and recycled with the same + * work function; the workqueue code assumes that the + * original work item cannot depend on the recycled work + * item in that case (see find_worker_executing_work()). + * + * Note that different types of Btrfs work can depend on + * each other, and one type of work on one Btrfs + * filesystem may even depend on the same type of work + * on another Btrfs filesystem via, e.g., a loop device. 
+ * Therefore, we must not allow the current work item to + * be recycled until we are really done, otherwise we + * break the above assumption and can deadlock. + */ + free_self = true; + } else { + /* + * We don't want to call the ordered free functions with + * the lock held. + */ + work->ordered_free(work); + /* NB: work must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, work); + } + } + spin_unlock_irqrestore(lock, flags); + + if (free_self) { + self->ordered_free(self); + /* NB: self must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, self); + } +} + +static void btrfs_work_helper(struct work_struct *normal_work) +{ + struct btrfs_work *work = container_of(normal_work, struct btrfs_work, + normal_work); + struct btrfs_workqueue *wq = work->wq; + int need_order = 0; + + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + + trace_btrfs_work_sched(work); + thresh_exec_hook(wq); + work->func(work); + if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq, work); + } else { + /* NB: work must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, work); + } +} + +void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, + btrfs_func_t ordered_func, btrfs_func_t ordered_free) +{ + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, btrfs_work_helper); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; +} + +void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) +{ + unsigned long flags; + + work->wq = wq; + thresh_queue_hook(wq); + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); + } + trace_btrfs_work_queued(work); + queue_work(wq->normal_wq, &work->normal_work); +} + +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) +{ + if (!wq) + return; + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); + kfree(wq); +} + +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) +{ + if (wq) + wq->limit_active = limit_active; +} + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ + flush_workqueue(wq->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 0000000000..30f66c5e2e --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. 
+ */ + +#ifndef BTRFS_ASYNC_THREAD_H +#define BTRFS_ASYNC_THREAD_H + +#include + +struct btrfs_fs_info; +struct btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); + +struct btrfs_work { + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct btrfs_workqueue *wq; + unsigned long flags; +}; + +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, + unsigned int flags, + int limit_active, + int thresh); +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags); +void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, + btrfs_func_t ordered_func, btrfs_func_t ordered_free); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq); +bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq); + +#endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 0000000000..a4a809efc9 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,3648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2011 STRATO. All rights reserved. + */ + +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" +#include "locking.h" +#include "misc.h" +#include "tree-mod-log.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "tree-checker.h" + +/* Just arbitrary numbers so we can be sure one of these happened. 
*/ +#define BACKREF_FOUND_SHARED 6 +#define BACKREF_FOUND_NOT_SHARED 7 + +struct extent_inode_elem { + u64 inum; + u64 offset; + u64 num_bytes; + struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct btrfs_key *key, + const struct extent_buffer *eb, + const struct btrfs_file_extent_item *fi, + struct extent_inode_elem **eie) +{ + const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); + u64 offset = key->offset; + struct extent_inode_elem *e; + const u64 *root_ids; + int root_count; + bool cached; + + if (!ctx->ignore_extent_item_pos && + !btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + + data_offset = btrfs_file_extent_offset(eb, fi); + + if (ctx->extent_item_pos < data_offset || + ctx->extent_item_pos >= data_offset + data_len) + return 1; + offset += ctx->extent_item_pos - data_offset; + } + + if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) + goto add_inode_elem; + + cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, + &root_count); + if (!cached) + goto add_inode_elem; + + for (int i = 0; i < root_count; i++) { + int ret; + + ret = ctx->indirect_ref_iterator(key->objectid, offset, + data_len, root_ids[i], + ctx->user_ctx); + if (ret) + return ret; + } + +add_inode_elem: + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = offset; + e->num_bytes = data_len; + *eie = e; + + return 0; +} + +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + +static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct extent_buffer *eb, + struct extent_inode_elem **eie) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != ctx->bytenr) + continue; + + ret = check_extent_in_eb(ctx, &key, eb, fi, eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) + return ret; + } + + return 0; +} + +struct preftree { + struct rb_root_cached root; + unsigned int count; +}; + +#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 } + +struct preftrees { + struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */ + struct preftree indirect; /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */ + struct preftree indirect_missing_keys; +}; + +/* + * Checks for a shared extent during backref search. 
+ * + * The share_count tracks prelim_refs (direct and indirect) having a + * ref->count >0: + * - incremented when a ref->count transitions to >0 + * - decremented when a ref->count transitions to <1 + */ +struct share_check { + struct btrfs_backref_share_check_ctx *ctx; + struct btrfs_root *root; + u64 inum; + u64 data_bytenr; + u64 data_extent_gen; + /* + * Counts number of inodes that refer to an extent (different inodes in + * the same root or different roots) that we could find. The sharedness + * check typically stops once this counter gets greater than 1, so it + * may not reflect the total number of inodes. + */ + int share_count; + /* + * The number of times we found our inode refers to the data extent we + * are determining the sharedness. In other words, how many file extent + * items we could find for our inode that point to our target data + * extent. The value we get here after finishing the extent sharedness + * check may be smaller than reality, but if it ends up being greater + * than 1, then we know for sure the inode has multiple file extent + * items that point to our inode, and we can safely assume it's useful + * to cache the sharedness check result. + */ + int self_ref_count; + bool have_delayed_delete_refs; +}; + +static inline int extent_is_shared(struct share_check *sc) +{ + return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0; +} + +static struct kmem_cache *btrfs_prelim_ref_cache; + +int __init btrfs_prelim_ref_init(void) +{ + btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", + sizeof(struct prelim_ref), + 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_prelim_ref_cache) + return -ENOMEM; + return 0; +} + +void __cold btrfs_prelim_ref_exit(void) +{ + kmem_cache_destroy(btrfs_prelim_ref_cache); +} + +static void free_pref(struct prelim_ref *ref) +{ + kmem_cache_free(btrfs_prelim_ref_cache, ref); +} + +/* + * Return 0 when both refs are for the same block (and can be merged). + * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 + * indicates a 'higher' block. + */ +static int prelim_ref_compare(struct prelim_ref *ref1, + struct prelim_ref *ref2) +{ + if (ref1->level < ref2->level) + return -1; + if (ref1->level > ref2->level) + return 1; + if (ref1->root_id < ref2->root_id) + return -1; + if (ref1->root_id > ref2->root_id) + return 1; + if (ref1->key_for_search.type < ref2->key_for_search.type) + return -1; + if (ref1->key_for_search.type > ref2->key_for_search.type) + return 1; + if (ref1->key_for_search.objectid < ref2->key_for_search.objectid) + return -1; + if (ref1->key_for_search.objectid > ref2->key_for_search.objectid) + return 1; + if (ref1->key_for_search.offset < ref2->key_for_search.offset) + return -1; + if (ref1->key_for_search.offset > ref2->key_for_search.offset) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + + return 0; +} + +static void update_share_count(struct share_check *sc, int oldcount, + int newcount, struct prelim_ref *newref) +{ + if ((!sc) || (oldcount == 0 && newcount < 1)) + return; + + if (oldcount > 0 && newcount < 1) + sc->share_count--; + else if (oldcount < 1 && newcount > 0) + sc->share_count++; + + if (newref->root_id == sc->root->root_key.objectid && + newref->wanted_disk_byte == sc->data_bytenr && + newref->key_for_search.objectid == sc->inum) + sc->self_ref_count += newref->count; +} + +/* + * Add @newref to the @root rbtree, merging identical refs. + * + * Callers should assume that newref has been freed after calling. 
+ */ +static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, + struct prelim_ref *newref, + struct share_check *sc) +{ + struct rb_root_cached *root; + struct rb_node **p; + struct rb_node *parent = NULL; + struct prelim_ref *ref; + int result; + bool leftmost = true; + + root = &preftree->root; + p = &root->rb_root.rb_node; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, newref); + if (result < 0) { + p = &(*p)->rb_left; + } else if (result > 0) { + p = &(*p)->rb_right; + leftmost = false; + } else { + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; + + while (eie && eie->next) + eie = eie->next; + + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. + */ + update_share_count(sc, ref->count, + ref->count + newref->count, newref); + ref->count += newref->count; + free_pref(newref); + return; + } + } + + update_share_count(sc, 0, newref->count, newref); + preftree->count++; + trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); + rb_link_node(&newref->rbnode, parent, p); + rb_insert_color_cached(&newref->rbnode, root, leftmost); +} + +/* + * Release the entire tree. We don't care about internal consistency so + * just free everything and then reset the tree root. + */ +static void prelim_release(struct preftree *preftree) +{ + struct prelim_ref *ref, *next_ref; + + rbtree_postorder_for_each_entry_safe(ref, next_ref, + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); + free_pref(ref); + } + + preftree->root = RB_ROOT_CACHED; + preftree->count = 0; +} + +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. 
+ */ +static int add_prelim_ref(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, u64 root_id, + const struct btrfs_key *key, int level, u64 parent, + u64 wanted_disk_byte, int count, + struct share_check *sc, gfp_t gfp_mask) +{ + struct prelim_ref *ref; + + if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) + return 0; + + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key_for_search = *key; + else + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + + ref->inode_list = NULL; + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + prelim_ref_insert(fs_info, preftree, ref, sc); + return extent_is_shared(sc); +} + +/* direct refs use root == 0, key == NULL */ +static int add_direct_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, int level, u64 parent, + u64 wanted_disk_byte, int count, + struct share_check *sc, gfp_t gfp_mask) +{ + return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level, + parent, wanted_disk_byte, count, sc, gfp_mask); +} + +/* indirect refs use parent == 0 */ +static int add_indirect_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, u64 root_id, + const struct btrfs_key *key, int level, + u64 wanted_disk_byte, int count, + struct share_check *sc, gfp_t gfp_mask) +{ + struct preftree *tree = &preftrees->indirect; + + if (!key) + tree = &preftrees->indirect_missing_keys; + return add_prelim_ref(fs_info, tree, root_id, key, level, 0, + wanted_disk_byte, count, sc, gfp_mask); +} + +static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) +{ + struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct prelim_ref *ref = NULL; + struct prelim_ref target = {}; + int result; + + target.parent = bytenr; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, &target); + + if (result < 0) + p = &(*p)->rb_left; + else if (result > 0) + p = &(*p)->rb_right; + else + return 1; + } + return 0; +} + +static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, + struct preftrees *preftrees, struct prelim_ref *ref, + int level) +{ + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; + struct btrfs_file_extent_item *fi; + struct extent_inode_elem *eie = NULL, *old = NULL; + u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; + u64 data_offset; + u8 type; + + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + return 0; + } + + /* + * 1. We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot == nritems. + * 2. We are searching for normal backref but bytenr of this leaf + * matches shared data backref + * 3. The leaf owner is not equal to the root we are searching + * + * For these cases, go to the next leaf before we continue. 
+ */ + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb) || + is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb)) { + if (ctx->time_seq == BTRFS_SEQ_LAST) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); + } + + while (!ret && count < ref->count) { + eb = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + /* + * We are searching for normal backref but bytenr of this leaf + * matches shared data backref, OR + * the leaf owner is not equal to the root we are searching for + */ + if (slot == 0 && + (is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb))) { + if (ctx->time_seq == BTRFS_SEQ_LAST) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); + continue; + } + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(eb, fi); + if (type == BTRFS_FILE_EXTENT_INLINE) + goto next; + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + data_offset = btrfs_file_extent_offset(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + old = NULL; + if (ref->key_for_search.offset == key.offset - data_offset) + count++; + else + goto next; + if (!ctx->skip_inode_ref_list) { + ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) + break; + } + if (ret > 0) + goto next; + ret = ulist_add_merge_ptr(parents, eb->start, + eie, (void **)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && !ctx->skip_inode_ref_list) { + while (old->next) + old = old->next; + old->next = eie; + } + eie = NULL; + } +next: + if (ctx->time_seq == BTRFS_SEQ_LAST) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, ctx->time_seq); + } + + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) + free_inode_elem_list(eie); + else if (ret > 0) + ret = 0; + + return ret; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, + struct preftrees *preftrees, + struct prelim_ref *ref, struct ulist *parents) +{ + struct btrfs_root *root; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + struct btrfs_key search_key = ref->key_for_search; + + /* + * If we're search_commit_root we could possibly be holding locks on + * other tree nodes. This happens when qgroups does backref walks when + * adding new delayed refs. To deal with this we need to look in cache + * for the root, and if we don't find it then we need to search the + * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage + * here. 
+ */ + if (path->search_commit_root) + root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); + else + root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_free; + } + + if (!path->search_commit_root && + test_bit(BTRFS_ROOT_DELETING, &root->state)) { + ret = -ENOENT; + goto out; + } + + if (btrfs_is_testing(ctx->fs_info)) { + ret = -ENOENT; + goto out; + } + + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else if (ctx->time_seq == BTRFS_SEQ_LAST) + root_level = btrfs_header_level(root->node); + else + root_level = btrfs_old_root_level(root, ctx->time_seq); + + if (root_level + 1 == level) + goto out; + + /* + * We can often find data backrefs with an offset that is too large + * (>= LLONG_MAX, maximum allowed file offset) due to underflows when + * subtracting a file's offset with the data offset of its + * corresponding extent data item. This can happen for example in the + * clone ioctl. + * + * So if we detect such case we set the search key's offset to zero to + * make sure we will find the matching file extent item at + * add_all_parents(), otherwise we will miss it because the offset + * taken form the backref is much larger then the offset of the file + * extent item. This can make us scan a very large number of file + * extent items, but at least it will not make us miss any. + * + * This is an ugly workaround for a behaviour that should have never + * existed, but it does and a fix for the clone ioctl would touch a lot + * of places, cause backwards incompatibility and would not fix the + * problem for extents cloned with older kernels. + */ + if (search_key.type == BTRFS_EXTENT_DATA_KEY && + search_key.offset >= LLONG_MAX) + search_key.offset = 0; + path->lowest_level = level; + if (ctx->time_seq == BTRFS_SEQ_LAST) + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + else + ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); + + btrfs_debug(ctx->fs_info, + "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", + ref->root_id, level, ref->count, ret, + ref->key_for_search.objectid, ref->key_for_search.type, + ref->key_for_search.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + while (!eb) { + if (WARN_ON(!level)) { + ret = 1; + goto out; + } + level--; + eb = path->nodes[level]; + } + + ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); +out: + btrfs_put_root(root); +out_free: + path->lowest_level = 0; + btrfs_release_path(path); + return ret; +} + +static struct extent_inode_elem * +unode_aux_to_inode_list(struct ulist_node *node) +{ + if (!node) + return NULL; + return (struct extent_inode_elem *)(uintptr_t)node->aux; +} + +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + +/* + * We maintain three separate rbtrees: one for direct refs, one for + * indirect refs which have a key, and one for indirect refs which do not + * have a key. Each tree does merge on insertion. + * + * Once all of the references are located, we iterate over the tree of + * indirect refs with missing keys. An appropriate key is located and + * the ref is moved onto the tree for indirect refs. 
After all missing + * keys are thus located, we iterate over the indirect ref tree, resolve + * each reference, and then insert the resolved reference onto the + * direct tree (merging there too). + * + * New backrefs (i.e., for parent nodes) are added to the appropriate + * rbtree as they are encountered. The new backrefs are subsequently + * resolved as above. + */ +static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, + struct preftrees *preftrees, + struct share_check *sc) +{ + int err; + int ret = 0; + struct ulist *parents; + struct ulist_node *node; + struct ulist_iterator uiter; + struct rb_node *rnode; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * We could trade memory usage for performance here by iterating + * the tree, allocating new refs for each insertion, and then + * freeing the entire indirect tree when we're done. In some test + * cases, the tree can grow quite large (~200k objects). + */ + while ((rnode = rb_first_cached(&preftrees->indirect.root))) { + struct prelim_ref *ref; + + ref = rb_entry(rnode, struct prelim_ref, rbnode); + if (WARN(ref->parent, + "BUG: direct ref found in indirect tree")) { + ret = -EINVAL; + goto out; + } + + rb_erase_cached(&ref->rbnode, &preftrees->indirect.root); + preftrees->indirect.count--; + + if (ref->count == 0) { + free_pref(ref); + continue; + } + + if (sc && ref->root_id != sc->root->root_key.objectid) { + free_pref(ref); + ret = BACKREF_FOUND_SHARED; + goto out; + } + err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. + */ + if (err == -ENOENT) { + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, + NULL); + continue; + } else if (err) { + free_pref(ref); + ret = err; + goto out; + } + + /* we put the first parent into the ref at hand */ + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); + ref->parent = node ? node->val : 0; + ref->inode_list = unode_aux_to_inode_list(node); + + /* Add a prelim_ref(s) for any other parent(s). */ + while ((node = ulist_next(parents, &uiter))) { + struct prelim_ref *new_ref; + + new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, + GFP_NOFS); + if (!new_ref) { + free_pref(ref); + ret = -ENOMEM; + goto out; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + new_ref->inode_list = unode_aux_to_inode_list(node); + prelim_ref_insert(ctx->fs_info, &preftrees->direct, + new_ref, NULL); + } + + /* + * Now it's a direct ref, put it in the direct tree. We must + * do this last because the ref could be merged/freed here. + */ + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); + + ulist_reinit(parents); + cond_resched(); + } +out: + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. + */ + free_leaf_list(parents); + return ret; +} + +/* + * read tree blocks and add keys where required. 
+ */ +static int add_missing_keys(struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, bool lock) +{ + struct prelim_ref *ref; + struct extent_buffer *eb; + struct preftree *tree = &preftrees->indirect_missing_keys; + struct rb_node *node; + + while ((node = rb_first_cached(&tree->root))) { + struct btrfs_tree_parent_check check = { 0 }; + + ref = rb_entry(node, struct prelim_ref, rbnode); + rb_erase_cached(node, &tree->root); + + BUG_ON(ref->parent); /* should not be a direct ref */ + BUG_ON(ref->key_for_search.type); + BUG_ON(!ref->wanted_disk_byte); + + check.level = ref->level - 1; + check.owner_root = ref->root_id; + + eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); + if (IS_ERR(eb)) { + free_pref(ref); + return PTR_ERR(eb); + } + if (!extent_buffer_uptodate(eb)) { + free_pref(ref); + free_extent_buffer(eb); + return -EIO; + } + + if (lock) + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + if (lock) + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); + cond_resched(); + } + return 0; +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int add_delayed_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head, u64 seq, + struct preftrees *preftrees, struct share_check *sc) +{ + struct btrfs_delayed_ref_node *node; + struct btrfs_key key; + struct rb_node *n; + int count; + int ret = 0; + + spin_lock(&head->lock); + for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { + node = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + count = node->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + count = node->ref_mod * -1; + break; + default: + BUG(); + } + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + /* NORMAL INDIRECT METADATA backref */ + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = add_indirect_ref(fs_info, preftrees, ref->root, + key_ptr, ref->level + 1, + node->bytenr, count, sc, + GFP_ATOMIC); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + /* SHARED DIRECT METADATA backref */ + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + + ret = add_direct_ref(fs_info, preftrees, ref->level + 1, + ref->parent, node->bytenr, count, + sc, GFP_ATOMIC); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + /* NORMAL INDIRECT DATA backref */ + struct btrfs_delayed_data_ref *ref; + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + + /* + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. 
+ * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. + */ + if (sc && count < 0) + sc->have_delayed_delete_refs = true; + + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &key, 0, node->bytenr, count, sc, + GFP_ATOMIC); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + /* SHARED DIRECT FULL backref */ + struct btrfs_delayed_data_ref *ref; + + ref = btrfs_delayed_node_to_data_ref(node); + + ret = add_direct_ref(fs_info, preftrees, 0, ref->parent, + node->bytenr, count, sc, + GFP_ATOMIC); + break; + } + default: + WARN_ON(1); + } + /* + * We must ignore BACKREF_FOUND_SHARED until all delayed + * refs have been checked. + */ + if (ret && (ret != BACKREF_FOUND_SHARED)) + break; + } + if (!ret) + ret = extent_is_shared(sc); + + spin_unlock(&head->lock); + return ret; +} + +/* + * add all inline backrefs for bytenr to the list + * + * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. + */ +static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, + int *info_level, struct preftrees *preftrees, + struct share_check *sc) +{ + int ret = 0; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0]; + + item_size = btrfs_item_size(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + + if (ctx->check_extent_item) { + ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); + if (ret) + return ret; + } + + flags = btrfs_extent_flags(leaf, ei); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { + *info_level = found_key.offset; + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, + BTRFS_REF_TYPE_ANY); + if (type == BTRFS_REF_TYPE_INVALID) + return -EUCLEAN; + + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_direct_ref(ctx->fs_info, preftrees, + *info_level + 1, offset, + ctx->bytenr, 1, NULL, GFP_NOFS); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + + ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, + ctx->bytenr, count, sc, GFP_NOFS); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_indirect_ref(ctx->fs_info, preftrees, offset, + NULL, *info_level + 1, + 
ctx->bytenr, 1, NULL, GFP_NOFS); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (sc && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { + ret = BACKREF_FOUND_SHARED; + break; + } + + root = btrfs_extent_data_ref_root(leaf, dref); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(ctx->fs_info, preftrees, + root, &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); + break; + } + default: + WARN_ON(1); + } + if (ret) + return ret; + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + * + * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. + */ +static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *extent_root, + struct btrfs_path *path, + int info_level, struct preftrees *preftrees, + struct share_check *sc) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != ctx->bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + /* SHARED DIRECT METADATA backref */ + ret = add_direct_ref(fs_info, preftrees, + info_level + 1, key.offset, + ctx->bytenr, 1, NULL, GFP_NOFS); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + /* SHARED DIRECT FULL backref */ + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = add_direct_ref(fs_info, preftrees, 0, + key.offset, ctx->bytenr, count, + sc, GFP_NOFS); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + /* NORMAL INDIRECT METADATA backref */ + ret = add_indirect_ref(fs_info, preftrees, key.offset, + NULL, info_level + 1, ctx->bytenr, + 1, NULL, GFP_NOFS); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + /* NORMAL INDIRECT DATA backref */ + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (sc && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { + ret = BACKREF_FOUND_SHARED; + break; + } + + root = btrfs_extent_data_ref_root(leaf, dref); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); + break; + } + default: + WARN_ON(1); + } + if (ret) + return ret; + + } + + return ret; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the 
root's last + * snapshot field changing while updating or checking the cache. + */ +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + const struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_backref_shared_cache_entry *entry; + + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + + if (!ctx->use_path_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &ctx->path_cache_entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + const struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + + if (!ctx->use_path_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &ctx->path_cache_entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. 
As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. + */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &ctx->path_cache_entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * @ctx: Backref walking context object, must be not NULL. + * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a + * shared extent is detected. + * + * Otherwise this returns 0 for success and <0 for an error. + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, + struct share_check *sc) +{ + struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head; + int info_level = 0; + int ret; + struct prelim_ref *ref; + struct rb_node *node; + struct extent_inode_elem *eie = NULL; + struct preftrees preftrees = { + .direct = PREFTREE_INIT, + .indirect = PREFTREE_INIT, + .indirect_missing_keys = PREFTREE_INIT + }; + + /* Roots ulist is not needed when using a sharedness check context. */ + if (sc) + ASSERT(ctx->roots == NULL); + + key.objectid = ctx->bytenr; + key.offset = (u64)-1; + if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (!ctx->trans) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + if (ctx->time_seq == BTRFS_SEQ_LAST) + path->skip_locking = 1; + +again: + head = NULL; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) { + /* This shouldn't happen, indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } + + if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && + ctx->time_seq != BTRFS_SEQ_LAST) { + /* + * We have a specific time_seq we care about and trans which + * means we have the path lock, we need to grab the ref head and + * lock it so we have a consistent view of the refs at the given + * time. 
+ */ + delayed_refs = &ctx->trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + spin_unlock(&delayed_refs->lock); + ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, + &preftrees, sc); + mutex_unlock(&head->mutex); + if (ret) + goto out; + } else { + spin_unlock(&delayed_refs->lock); + } + } + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + path->slots[0]--; + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == ctx->bytenr && + (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY)) { + ret = add_inline_refs(ctx, path, &info_level, + &preftrees, sc); + if (ret) + goto out; + ret = add_keyed_refs(ctx, root, path, info_level, + &preftrees, sc); + if (ret) + goto out; + } + } + + /* + * If we have a share context and we reached here, it means the extent + * is not directly shared (no multiple reference items for it), + * otherwise we would have exited earlier with a return value of + * BACKREF_FOUND_SHARED after processing delayed references or while + * processing inline or keyed references from the extent tree. + * The extent may however be indirectly shared through shared subtrees + * as a result from creating snapshots, so we determine below what is + * its parent node, in case we are dealing with a metadata extent, or + * what's the leaf (or leaves), from a fs tree, that has a file extent + * item pointing to it in case we are dealing with a data extent. + */ + ASSERT(extent_is_shared(sc) == 0); + + /* + * If we are here for a data extent and we have a share_check structure + * it means the data extent is not directly shared (does not have + * multiple reference items), so we have to check if a path in the fs + * tree (going from the root node down to the leaf that has the file + * extent item pointing to the data extent) is shared, that is, if any + * of the extent buffers in the path is referenced by other trees. + */ + if (sc && ctx->bytenr == sc->data_bytenr) { + /* + * If our data extent is from a generation more recent than the + * last generation used to snapshot the root, then we know that + * it can not be shared through subtrees, so we can skip + * resolving indirect references, there's no point in + * determining the extent buffers for the path from the fs tree + * root node down to the leaf that has the file extent item that + * points to the data extent. + */ + if (sc->data_extent_gen > + btrfs_root_last_snapshot(&sc->root->root_item)) { + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + + /* + * If we are only determining if a data extent is shared or not + * and the corresponding file extent item is located in the same + * leaf as the previous file extent item, we can skip resolving + * indirect references for a data extent, since the fs tree path + * is the same (same leaf, so same path). 
We skip as long as the + * cached result for the leaf is valid and only if there's only + * one file extent item pointing to the data extent, because in + * the case of multiple file extent items, they may be located + * in different leaves and therefore we have multiple paths. + */ + if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && + sc->self_ref_count == 1) { + bool cached; + bool is_shared; + + cached = lookup_backref_shared_cache(sc->ctx, sc->root, + sc->ctx->curr_leaf_bytenr, + 0, &is_shared); + if (cached) { + if (is_shared) + ret = BACKREF_FOUND_SHARED; + else + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + } + } + + btrfs_release_path(path); + + ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); + if (ret) + goto out; + + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); + + ret = resolve_indirect_refs(ctx, path, &preftrees, sc); + if (ret) + goto out; + + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root)); + + /* + * This walks the tree of merged and resolved refs. Tree blocks are + * read in as needed. Unique entries are added to the ulist, and + * the list of found roots is updated. + * + * We release the entire tree in one go before returning. + */ + node = rb_first_cached(&preftrees.direct.root); + while (node) { + ref = rb_entry(node, struct prelim_ref, rbnode); + node = rb_next(&ref->rbnode); + /* + * ref->count < 0 can happen here if there are delayed + * refs with a node->action of BTRFS_DROP_DELAYED_REF. + * prelim_ref_insert() relies on this when merging + * identical refs to keep the overall count correct. + * prelim_ref_insert() will merge only those refs + * which compare identically. Any refs having + * e.g. different offsets would not be merged, + * and would retain their original ref->count < 0. + */ + if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS); + if (ret < 0) + goto out; + } + if (ref->count && ref->parent) { + if (!ctx->skip_inode_ref_list && !ref->inode_list && + ref->level == 0) { + struct btrfs_tree_parent_check check = { 0 }; + struct extent_buffer *eb; + + check.level = ref->level; + + eb = read_tree_block(ctx->fs_info, ref->parent, + &check); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + if (!path->skip_locking) + btrfs_tree_read_lock(eb); + ret = find_extent_in_eb(ctx, eb, &eie); + if (!path->skip_locking) + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) + goto out; + ref->inode_list = eie; + /* + * We transferred the list ownership to the ref, + * so set to NULL to avoid a double free in case + * an error happens after this. + */ + eie = NULL; + } + ret = ulist_add_merge_ptr(ctx->refs, ref->parent, + ref->inode_list, + (void **)&eie, GFP_NOFS); + if (ret < 0) + goto out; + if (!ret && !ctx->skip_inode_ref_list) { + /* + * We've recorded that parent, so we must extend + * its inode list here. + * + * However if there was corruption we may not + * have found an eie, return an error in this + * case. + */ + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } + eie = NULL; + /* + * We have transferred the inode list ownership from + * this ref to the ref we added to the 'refs' ulist. 
+ * So set this ref's inode list to NULL to avoid + * use-after-free when our caller uses it or double + * frees in case an error happens before we return. + */ + ref->inode_list = NULL; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + + prelim_release(&preftrees.direct); + prelim_release(&preftrees.indirect); + prelim_release(&preftrees.indirect_missing_keys); + + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) + free_inode_elem_list(eie); + return ret; +} + +/* + * Finds all leaves with a reference to the specified combination of + * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are + * added to the ulist at @ctx->refs, and that ulist is allocated by this + * function. The caller should free the ulist with free_leaf_list() if + * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is + * enough. + * + * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. + */ +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) +{ + int ret; + + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) + return -ENOMEM; + + ret = find_parent_nodes(ctx, NULL); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + (ret < 0 && ret != -ENOENT)) { + free_leaf_list(ctx->refs); + ctx->refs = NULL; + return ret; + } + + return 0; +} + +/* + * Walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. + * + * Found roots are added to @ctx->roots, which is allocated by this function if + * it points to NULL, in which case the caller is responsible for freeing it + * after it's not needed anymore. + * This function requires @ctx->refs to be NULL, as it uses it for allocating a + * ulist to do temporary work, and frees it before returning. + * + * Returns 0 on success, < 0 on error. 
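+ *
+ * A rough usage sketch for the btrfs_find_all_roots() wrapper below, with
+ * error handling trimmed (@fs_info and @bytenr stand for whatever the
+ * caller already has at hand):
+ *
+ *   struct btrfs_backref_walk_ctx walk_ctx = { 0 };
+ *
+ *   walk_ctx.fs_info = fs_info;
+ *   walk_ctx.bytenr = bytenr;
+ *   ret = btrfs_find_all_roots(&walk_ctx, false);
+ *
+ * On success walk_ctx.roots holds the id of every root that references
+ * @bytenr, and the caller must free it with ulist_free().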
+ */ +static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) +{ + const u64 orig_bytenr = ctx->bytenr; + const bool orig_skip_inode_ref_list = ctx->skip_inode_ref_list; + bool roots_ulist_allocated = false; + struct ulist_iterator uiter; + int ret = 0; + + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) + return -ENOMEM; + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; + return -ENOMEM; + } + roots_ulist_allocated = true; + } + + ctx->skip_inode_ref_list = true; + + ULIST_ITER_INIT(&uiter); + while (1) { + struct ulist_node *node; + + ret = find_parent_nodes(ctx, NULL); + if (ret < 0 && ret != -ENOENT) { + if (roots_ulist_allocated) { + ulist_free(ctx->roots); + ctx->roots = NULL; + } + break; + } + ret = 0; + node = ulist_next(ctx->refs, &uiter); + if (!node) + break; + ctx->bytenr = node->val; + cond_resched(); + } + + ulist_free(ctx->refs); + ctx->refs = NULL; + ctx->bytenr = orig_bytenr; + ctx->skip_inode_ref_list = orig_skip_inode_ref_list; + + return ret; +} + +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, + bool skip_commit_root_sem) +{ + int ret; + + if (!ctx->trans && !skip_commit_root_sem) + down_read(&ctx->fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(ctx); + if (!ctx->trans && !skip_commit_root_sem) + up_read(&ctx->fs_info->commit_root_sem); + return ret; +} + +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) +{ + struct btrfs_backref_share_check_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ulist_init(&ctx->refs); + + return ctx; +} + +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) +{ + if (!ctx) + return; + + ulist_release(&ctx->refs); + kfree(ctx); +} + +/* + * Check if a data extent is shared or not. + * + * @inode: The inode whose extent we are checking. + * @bytenr: Logical bytenr of the extent we are checking. + * @extent_gen: Generation of the extent (file extent item) or 0 if it is + * not known. + * @ctx: A backref sharedness check context. + * + * btrfs_is_data_extent_shared uses the backref walking code but will short + * circuit as soon as it finds a root or inode that doesn't match the + * one passed in. This provides a significant performance benefit for + * callers (such as fiemap) which want to know whether the extent is + * shared but do not need a ref count. + * + * This attempts to attach to the running transaction in order to account for + * delayed refs, but continues on even when no running transaction exists. + * + * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 
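+ *
+ * A minimal calling sketch, with error handling trimmed (@leaf_bytenr is
+ * illustrative and stands for the bytenr of the leaf holding the file
+ * extent item, which the caller is expected to track):
+ *
+ *   struct btrfs_backref_share_check_ctx *shared_ctx;
+ *
+ *   shared_ctx = btrfs_alloc_backref_share_check_ctx();
+ *   shared_ctx->curr_leaf_bytenr = leaf_bytenr;
+ *   ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen,
+ *                                     shared_ctx);
+ *   btrfs_free_backref_share_ctx(shared_ctx);
+ *
+ * The same context can be reused for consecutive extents of the same inode
+ * so that the path and previous-extents caches stay warm.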
+ */ +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + u64 extent_gen, + struct btrfs_backref_share_check_ctx *ctx) +{ + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + struct ulist_iterator uiter; + struct ulist_node *node; + struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); + int ret = 0; + struct share_check shared = { + .ctx = ctx, + .root = root, + .inum = btrfs_ino(inode), + .data_bytenr = bytenr, + .data_extent_gen = extent_gen, + .share_count = 0, + .self_ref_count = 0, + .have_delayed_delete_refs = false, + }; + int level; + bool leaf_cached; + bool leaf_is_shared; + + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) + return ctx->prev_extents_cache[i].is_shared; + } + + ulist_init(&ctx->refs); + + trans = btrfs_join_transaction_nostart(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) { + ret = PTR_ERR(trans); + goto out; + } + trans = NULL; + down_read(&fs_info->commit_root_sem); + } else { + btrfs_get_tree_mod_seq(fs_info, &elem); + walk_ctx.time_seq = elem.seq; + } + + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. + */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + + walk_ctx.skip_inode_ref_list = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; + walk_ctx.refs = &ctx->refs; + + /* -1 means we are in the bytenr of the data extent. */ + level = -1; + ULIST_ITER_INIT(&uiter); + while (1) { + const unsigned long prev_ref_count = ctx->refs.nnodes; + + walk_ctx.bytenr = bytenr; + ret = find_parent_nodes(&walk_ctx, &shared); + if (ret == BACKREF_FOUND_SHARED || + ret == BACKREF_FOUND_NOT_SHARED) { + /* If shared must return 1, otherwise return 0. */ + ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0; + if (level >= 0) + store_backref_shared_cache(ctx, root, bytenr, + level, ret == 1); + break; + } + if (ret < 0 && ret != -ENOENT) + break; + ret = 0; + + /* + * More than one extent buffer (bytenr) may have been added to + * the ctx->refs ulist, in which case we have to check multiple + * tree paths in case the first one is not shared, so we can not + * use the path cache which is made for a single path. Multiple + * extent buffers at the current level happen when: + * + * 1) level -1, the data extent: If our data extent was not + * directly shared (without multiple reference items), then + * it might have a single reference item with a count > 1 for + * the same offset, which means there are 2 (or more) file + * extent items that point to the data extent - this happens + * when a file extent item needs to be split and then one + * item gets moved to another leaf due to a b+tree leaf split + * when inserting some item. In this case the file extent + * items may be located in different leaves and therefore + * some of the leaves may be referenced through shared + * subtrees while others are not. 
Since our extent buffer + * cache only works for a single path (by far the most common + * case and simpler to deal with), we can not use it if we + * have multiple leaves (which implies multiple paths). + * + * 2) level >= 0, a tree node/leaf: We can have a mix of direct + * and indirect references on a b+tree node/leaf, so we have + * to check multiple paths, and the extent buffer (the + * current bytenr) may be shared or not. One example is + * during relocation as we may get a shared tree block ref + * (direct ref) and a non-shared tree block ref (indirect + * ref) for the same node/leaf. + */ + if ((ctx->refs.nnodes - prev_ref_count) > 1) + ctx->use_path_cache = false; + + if (level >= 0) + store_backref_shared_cache(ctx, root, bytenr, + level, false); + node = ulist_next(&ctx->refs, &uiter); + if (!node) + break; + bytenr = node->val; + if (ctx->use_path_cache) { + bool is_shared; + bool cached; + + level++; + cached = lookup_backref_shared_cache(ctx, root, bytenr, + level, &is_shared); + if (cached) { + ret = (is_shared ? 1 : 0); + break; + } + } + shared.share_count = 0; + shared.have_delayed_delete_refs = false; + cond_resched(); + } + + /* + * If the path cache is disabled, then it means at some tree level we + * got multiple parents due to a mix of direct and indirect backrefs or + * multiple leaves with file extent items pointing to the same data + * extent. We have to invalidate the cache and cache only the sharedness + * result for the levels where we got only one node/reference. + */ + if (!ctx->use_path_cache) { + int i = 0; + + level--; + if (ret >= 0 && level >= 0) { + bytenr = ctx->path_cache_entries[level].bytenr; + ctx->use_path_cache = true; + store_backref_shared_cache(ctx, root, bytenr, level, ret); + i = level + 1; + } + + for ( ; i < BTRFS_MAX_LEVEL; i++) + ctx->path_cache_entries[i].bytenr = 0; + } + + /* + * Cache the sharedness result for the data extent if we know our inode + * has more than 1 file extent item that refers to the data extent. + */ + if (ret >= 0 && shared.self_ref_count > 1) { + int slot = ctx->prev_extents_cache_slot; + + ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr; + ctx->prev_extents_cache[slot].is_shared = (ret == 1); + + slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; + ctx->prev_extents_cache_slot = slot; + } + +out_trans: + if (trans) { + btrfs_put_tree_mod_seq(fs_info, &elem); + btrfs_end_transaction(trans); + } else { + up_read(&fs_info->commit_root_sem); + } +out: + ulist_release(&ctx->refs); + ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; + + return ret; +} + +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off) +{ + int ret, slot; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_inode_extref *extref; + const struct extent_buffer *leaf; + unsigned long ptr; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = start_off; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If the item at offset is not found, + * btrfs_search_slot will point us to the slot + * where it should be inserted. In our case + * that will be the slot directly before the + * next INODE_REF_KEY_V2 item. In the case + * that we're pointing to the last slot in a + * leaf, we must move one leaf over. 
+ */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret >= 1) + ret = -ENOENT; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* + * Check that we're still looking at an extended ref key for + * this particular objectid. If we have different + * objectid or type then there are no more to be found + * in the tree and we can exit. + */ + ret = -ENOENT; + if (found_key.objectid != inode_objectid) + break; + if (found_key.type != BTRFS_INODE_EXTREF_KEY) + break; + + ret = 0; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + extref = (struct btrfs_inode_extref *)ptr; + *ret_extref = extref; + if (found_off) + *found_off = found_key.offset; + break; + } + + return ret; +} + +/* + * this iterates to turn a name (from iref/extref) into a full filesystem path. + * Elements of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + int slot; + u64 next_inum; + int ret; + s64 bytes_left = ((s64)size) - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + struct btrfs_inode_ref *iref; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + while (1) { + bytes_left -= name_len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + name_off, name_len); + if (eb != eb_in) { + if (!path->skip_locking) + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + ret = btrfs_find_item(fs_root, path, parent, 0, + BTRFS_INODE_REF_KEY, &found_key); + if (ret > 0) + ret = -ENOENT; + if (ret) + break; + + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) { + path->nodes[0] = NULL; + path->locks[0] = 0; + } + btrfs_release_path(path); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. 
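+ * (In the implementation below the flag is reported through @flags_ret,
+ * while the return value is 0 on success and a negative errno on failure.)
+ *
+ * A short caller sketch, assuming @path was allocated with
+ * btrfs_alloc_path() and is released by the caller afterwards:
+ *
+ *   u64 flags = 0;
+ *   struct btrfs_key found_key;
+ *
+ *   ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
+ *   if (ret < 0)
+ *           return ret;
+ *   if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *           ...handle a tree block...
+ *   else
+ *           ...handle a data extent, at offset logical - found_key.objectid
+ *              within the extent item described by found_key...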
+ */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags_ret) +{ + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); + int ret; + u64 flags; + u64 size = 0; + u32 item_size; + const struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ret = btrfs_previous_extent_item(extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; + } + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type == BTRFS_METADATA_ITEM_KEY) + size = fs_info->nodesize; + else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) + size = found_key->offset; + + if (found_key->objectid > logical || + found_key->objectid + size <= logical) { + btrfs_debug(fs_info, + "logical %llu is not within any extent", logical); + return -ENOENT; + } + + eb = path->nodes[0]; + item_size = btrfs_item_size(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + btrfs_debug(fs_info, + "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u", + logical, logical - found_key->objectid, found_key->objectid, + found_key->offset, flags, item_size); + + WARN_ON(!flags_ret); + if (flags_ret) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else if (flags & BTRFS_EXTENT_FLAG_DATA) + *flags_ret = BTRFS_EXTENT_FLAG_DATA; + else + BUG(); + return 0; + } + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. 
+ * returns <0 on error + */ +static int get_extent_inline_ref(unsigned long *ptr, + const struct extent_buffer *eb, + const struct btrfs_key *key, + const struct btrfs_extent_item *ei, + u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); + *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, + BTRFS_REF_TYPE_ANY); + if (*out_type == BTRFS_REF_TYPE_INVALID) + return -EUCLEAN; + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + + if (key->type == BTRFS_EXTENT_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_level = btrfs_tree_block_level(eb, info); + } else { + ASSERT(key->type == BTRFS_METADATA_ITEM_KEY); + *out_level = (u8)key->offset; + } + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, + struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, + iterate_extent_inodes_t *iterate, void *ctx) +{ + struct extent_inode_elem *eie; + int ret = 0; + + for (eie = inode_list; eie; eie = eie->next) { + btrfs_debug(fs_info, + "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", + extent_item_objectid, eie->inum, + eie->offset, root); + ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); + if (ret) { + btrfs_debug(fs_info, + "stopping iteration for %llu due to ret=%d", + extent_item_objectid, ret); + break; + } + } + + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. + * when the iterator function returns a non-zero value, iteration stops. 
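+ *
+ * A bare-bones caller sketch (my_cb, my_ctx and extent_bytenr are
+ * illustrative names only; see build_ino_list() and
+ * iterate_inodes_from_logical() below for the real in-tree user):
+ *
+ *   static int my_cb(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
+ *   {
+ *           pr_info("inode %llu offset %llu root %llu\n", inum, offset, root);
+ *           return 0;
+ *   }
+ *
+ *   struct btrfs_backref_walk_ctx walk_ctx = { 0 };
+ *
+ *   walk_ctx.fs_info = fs_info;
+ *   walk_ctx.bytenr = extent_bytenr;
+ *   walk_ctx.extent_item_pos = offset_within_extent;
+ *   ret = iterate_extent_inodes(&walk_ctx, false, my_cb, my_ctx);
+ *
+ * Returning a non-zero value from the callback stops the iteration, as
+ * noted above.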
+ */ +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx) +{ + int ret; + struct ulist *refs; + struct ulist_node *ref_node; + struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); + struct ulist_iterator ref_uiter; + + btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", + ctx->bytenr); + + ASSERT(ctx->trans == NULL); + ASSERT(ctx->roots == NULL); + + if (!search_commit_root) { + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction(ctx->fs_info->tree_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && + PTR_ERR(trans) != -EROFS) + return PTR_ERR(trans); + trans = NULL; + } + ctx->trans = trans; + } + + if (ctx->trans) { + btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); + ctx->time_seq = seq_elem.seq; + } else { + down_read(&ctx->fs_info->commit_root_sem); + } + + ret = btrfs_find_all_leafs(ctx); + if (ret) + goto out; + refs = ctx->refs; + ctx->refs = NULL; + + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + const u64 leaf_bytenr = ref_node->val; + struct ulist_node *root_node; + struct ulist_iterator root_uiter; + struct extent_inode_elem *inode_list; + + inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; + + if (ctx->cache_lookup) { + const u64 *root_ids; + int root_count; + bool cached; + + cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, + &root_ids, &root_count); + if (cached) { + for (int i = 0; i < root_count; i++) { + ret = iterate_leaf_refs(ctx->fs_info, + inode_list, + root_ids[i], + leaf_bytenr, + iterate, + user_ctx); + if (ret) + break; + } + continue; + } + } + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ret = -ENOMEM; + break; + } + } + + ctx->bytenr = leaf_bytenr; + ret = btrfs_find_all_roots_safe(ctx); + if (ret) + break; + + if (ctx->cache_store) + ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); + + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { + btrfs_debug(ctx->fs_info, + "root %llu references leaf %llu, data list %#llx", + root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs(ctx->fs_info, inode_list, + root_node->val, ctx->bytenr, + iterate, user_ctx); + } + ulist_reinit(ctx->roots); + } + + free_leaf_list(refs); +out: + if (ctx->trans) { + btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem); + btrfs_end_transaction(ctx->trans); + ctx->trans = NULL; + } else { + up_read(&ctx->fs_info->commit_root_sem); + } + + ulist_free(ctx->roots); + ctx->roots = NULL; + + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP) + ret = 0; + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + void *ctx, bool ignore_offset) +{ + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; + int ret; + u64 flags = 0; + struct btrfs_key found_key; + int search_commit_root = path->search_commit_root; 
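+
+ /*
+ * Resolve the extent item that covers @logical, reject tree blocks and
+ * then report every (inode, offset, root) combination referencing the
+ * data extent to the caller's @ctx through build_ino_list().
+ */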
+ + ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); + btrfs_release_path(path); + if (ret < 0) + return ret; + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return -EINVAL; + + walk_ctx.bytenr = found_key.objectid; + if (ignore_offset) + walk_ctx.ignore_extent_item_pos = true; + else + walk_ctx.extent_item_pos = logical - found_key.objectid; + walk_ctx.fs_info = fs_info; + + return iterate_extent_inodes(&walk_ctx, search_commit_root, + build_ino_list, ctx); +} + +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, struct inode_fs_paths *ipath); + +static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) +{ + int ret = 0; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; + struct extent_buffer *eb; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (!ret) { + ret = btrfs_find_item(fs_root, path, inum, + parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, + &found_key); + + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + btrfs_release_path(path); + + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + btrfs_debug(fs_root->fs_info, + "following ref at offset %u for inode %llu in tree %llu", + cur, found_key.objectid, + fs_root->root_key.objectid); + ret = inode_to_path(parent, name_len, + (unsigned long)(iref + 1), eb, ipath); + if (ret) + break; + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) +{ + int ret; + int slot; + u64 offset = 0; + u64 parent; + int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; + struct extent_buffer *eb; + struct btrfs_inode_extref *extref; + u32 item_size; + u32 cur_offset; + unsigned long ptr; + + while (1) { + ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, + &offset); + if (ret < 0) + break; + if (ret) { + ret = found ? 
0 : -ENOENT; + break; + } + ++found; + + slot = path->slots[0]; + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + btrfs_release_path(path); + + item_size = btrfs_item_size(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + cur_offset = 0; + + while (cur_offset < item_size) { + u32 name_len; + + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + name_len = btrfs_inode_extref_name_len(eb, extref); + ret = inode_to_path(parent, name_len, + (unsigned long)&extref->name, eb, ipath); + if (ret) + break; + + cur_offset += btrfs_inode_extref_name_len(eb, extref); + cur_offset += sizeof(*extref); + } + free_extent_buffer(eb); + + offset++; + } + + btrfs_release_path(path); + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, struct inode_fs_paths *ipath) +{ + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; + fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->val[i] = (u64)(unsigned long)fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, ipath); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, ipath); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; +} + +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = kvmalloc(alloc_bytes, GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. 
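+ *
+ * Sketch of the intended call sequence, with error handling trimmed
+ * (@fs_root, @path and @inum stand for values the caller already has,
+ * and use_path() is just a placeholder):
+ *
+ *   struct inode_fs_paths *ipath;
+ *   u32 i;
+ *
+ *   ipath = init_ipath(4096, fs_root, path);
+ *   ret = paths_from_inode(inum, ipath);
+ *   for (i = 0; i < ipath->fspath->elem_cnt; i++)
+ *           use_path((char *)(unsigned long)ipath->fspath->val[i]);
+ *   free_ipath(ipath);
+ *
+ * A non-zero ipath->fspath->elem_missed afterwards means the container was
+ * too small; bytes_missing then tells how much extra space was needed.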
+ */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return ERR_CAST(fspath); + + ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); + if (!ifp) { + kvfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + if (!ipath) + return; + kvfree(ipath->fspath); + kfree(ipath); +} + +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) +{ + struct btrfs_backref_iter *ret; + + ret = kzalloc(sizeof(*ret), GFP_NOFS); + if (!ret) + return NULL; + + ret->path = btrfs_alloc_path(); + if (!ret->path) { + kfree(ret); + return NULL; + } + + /* Current backref iterator only supports iteration in commit root */ + ret->path->search_commit_root = 1; + ret->path->skip_locking = 1; + ret->fs_info = fs_info; + + return ret; +} + +int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) +{ + struct btrfs_fs_info *fs_info = iter->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_path *path = iter->path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + iter->bytenr = bytenr; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { + ret = -EUCLEAN; + goto release; + } + if (path->slots[0] == 0) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EUCLEAN; + goto release; + } + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if ((key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) { + ret = -ENOENT; + goto release; + } + memcpy(&iter->cur_key, &key, sizeof(key)); + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->end_ptr = (u32)(iter->item_ptr + + btrfs_item_size(path->nodes[0], path->slots[0])); + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + + /* + * Only support iteration on tree backref yet. + * + * This is an extra precaution for non skinny-metadata, where + * EXTENT_ITEM is also used for tree blocks, that we can only use + * extent flags to determine if it's a tree block. 
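+ *
+ * For reference, the expected pattern around this helper is roughly:
+ *
+ *   ret = btrfs_backref_iter_start(iter, bytenr);
+ *   for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
+ *           ...inspect iter->cur_key and iter->cur_ptr...
+ *   }
+ *   btrfs_backref_iter_release(iter);
+ *
+ * btrfs_backref_add_tree_node() further below is the in-tree user of this
+ * pattern.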
+ */ + if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) { + ret = -ENOTSUPP; + goto release; + } + iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei)); + + /* If there is no inline backref, go search for keyed backref */ + if (iter->cur_ptr >= iter->end_ptr) { + ret = btrfs_next_item(extent_root, path); + + /* No inline nor keyed ref */ + if (ret > 0) { + ret = -ENOENT; + goto release; + } + if (ret < 0) + goto release; + + btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, + path->slots[0]); + if (iter->cur_key.objectid != bytenr || + (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY && + iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) { + ret = -ENOENT; + goto release; + } + iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->item_ptr = iter->cur_ptr; + iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( + path->nodes[0], path->slots[0])); + } + + return 0; +release: + btrfs_backref_iter_release(iter); + return ret; +} + +/* + * Go to the next backref item of current bytenr, can be either inlined or + * keyed. + * + * Caller needs to check whether it's inline ref or not by iter->cur_key. + * + * Return 0 if we get next backref without problem. + * Return >0 if there is no extra backref for this bytenr. + * Return <0 if there is something wrong happened. + */ +int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) +{ + struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct btrfs_root *extent_root; + struct btrfs_path *path = iter->path; + struct btrfs_extent_inline_ref *iref; + int ret; + u32 size; + + if (btrfs_backref_iter_is_inline_ref(iter)) { + /* We're still inside the inline refs */ + ASSERT(iter->cur_ptr < iter->end_ptr); + + if (btrfs_backref_has_tree_block_info(iter)) { + /* First tree block info */ + size = sizeof(struct btrfs_tree_block_info); + } else { + /* Use inline ref type to determine the size */ + int type; + + iref = (struct btrfs_extent_inline_ref *) + ((unsigned long)iter->cur_ptr); + type = btrfs_extent_inline_ref_type(eb, iref); + + size = btrfs_extent_inline_ref_size(type); + } + iter->cur_ptr += size; + if (iter->cur_ptr < iter->end_ptr) + return 0; + + /* All inline items iterated, fall through */ + } + + /* We're at keyed items, there is no inline item, go to the next one */ + extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr); + ret = btrfs_next_item(extent_root, iter->path); + if (ret) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]); + if (iter->cur_key.objectid != iter->bytenr || + (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY && + iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY)) + return 1; + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->cur_ptr = iter->item_ptr; + iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], + path->slots[0]); + return 0; +} + +void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, + struct btrfs_backref_cache *cache, int is_reloc) +{ + int i; + + cache->rb_root = RB_ROOT; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); + INIT_LIST_HEAD(&cache->pending_edge); + INIT_LIST_HEAD(&cache->useless_node); + cache->fs_info = fs_info; + cache->is_reloc = is_reloc; +} + +struct btrfs_backref_node *btrfs_backref_alloc_node( + struct btrfs_backref_cache *cache, u64 bytenr, int level) +{ + struct btrfs_backref_node 
*node; + + ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL); + node = kzalloc(sizeof(*node), GFP_NOFS); + if (!node) + return node; + + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + node->level = level; + node->bytenr = bytenr; + + return node; +} + +struct btrfs_backref_edge *btrfs_backref_alloc_edge( + struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +/* + * Drop the backref node from cache, also cleaning up all its + * upper edges and any uncached nodes in the path. + * + * This cleanup happens bottom up, thus the node should either + * be the lowest node in the cache or a detached node. + */ +void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest && !node->detached); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct btrfs_backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + btrfs_backref_free_edge(cache, edge); + + /* + * Add the node to leaf node list if no other child block + * cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, &cache->leaves); + upper->lowest = 1; + } + } + + btrfs_backref_drop_node(cache, node); +} + +/* + * Release all nodes/edges from current cache + */ +void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct btrfs_backref_node, list); + btrfs_backref_cleanup_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct btrfs_backref_node, lower); + btrfs_backref_cleanup_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->pending_edge)); + ASSERT(list_empty(&cache->useless_node)); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); +} + +/* + * Handle direct tree backref + * + * Direct tree backref means, the backref item shows its parent bytenr + * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined). + * + * @ref_key: The converted backref key. + * For keyed backref, it's the item key. + * For inlined backref, objectid is the bytenr, + * type is btrfs_inline_ref_type, offset is + * btrfs_inline_ref_offset. 
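+ *
+ * Put differently: for a SHARED_BLOCK_REF_KEY backref, @ref_key->offset is
+ * already the bytenr of the parent tree block, so the parent is known
+ * without any extra tree search (contrast handle_indirect_tree_backref()).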
+ */ +static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, + struct btrfs_key *ref_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *upper; + struct rb_node *rb_node; + + ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY); + + /* Only reloc root uses backref pointing to itself */ + if (ref_key->objectid == ref_key->offset) { + struct btrfs_root *root; + + cur->is_reloc_root = 1; + /* Only reloc backref cache cares about a specific root */ + if (cache->is_reloc) { + root = find_reloc_root(cache->fs_info, cur->bytenr); + if (!root) + return -ENOENT; + cur->root = root; + } else { + /* + * For generic purpose backref cache, reloc root node + * is useless. + */ + list_add(&cur->list, &cache->useless_node); + } + return 0; + } + + edge = btrfs_backref_alloc_edge(cache); + if (!edge) + return -ENOMEM; + + rb_node = rb_simple_search(&cache->rb_root, ref_key->offset); + if (!rb_node) { + /* Parent node not yet cached */ + upper = btrfs_backref_alloc_node(cache, ref_key->offset, + cur->level + 1); + if (!upper) { + btrfs_backref_free_edge(cache, edge); + return -ENOMEM; + } + + /* + * Backrefs for the upper level block isn't cached, add the + * block to pending list + */ + list_add_tail(&edge->list[UPPER], &cache->pending_edge); + } else { + /* Parent node already cached */ + upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); + return 0; +} + +/* + * Handle indirect tree backref + * + * Indirect tree backref means, we only know which tree the node belongs to. + * We still need to do a tree search to find out the parents. This is for + * TREE_BLOCK_REF backref (keyed or inlined). + * + * @trans: Transaction handle. + * @ref_key: The same as @ref_key in handle_direct_tree_backref() + * @tree_key: The first key of this tree block. + * @path: A clean (released) path, to avoid allocating path every time + * the function get called. + */ +static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_key *ref_key, + struct btrfs_key *tree_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_backref_node *upper; + struct btrfs_backref_node *lower; + struct btrfs_backref_edge *edge; + struct extent_buffer *eb; + struct btrfs_root *root; + struct rb_node *rb_node; + int level; + bool need_check = true; + int ret; + + root = btrfs_get_fs_root(fs_info, ref_key->offset, false); + if (IS_ERR(root)) + return PTR_ERR(root); + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + cur->cowonly = 1; + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* Tree root */ + ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); + /* + * For reloc backref cache, we may ignore reloc root. But for + * general purpose backref cache, we can't rely on + * btrfs_should_ignore_reloc_root() as it may conflict with + * current running relocation and lead to missing root. + * + * For general purpose backref cache, reloc root detection is + * completely relying on direct backref (key->offset is parent + * bytenr), thus only do such check for reloc cache. 
+ */ + if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) { + btrfs_put_root(root); + list_add(&cur->list, &cache->useless_node); + } else { + cur->root = root; + } + return 0; + } + + level = cur->level + 1; + + /* Search the tree to find parent blocks referring to the block */ + path->search_commit_root = 1; + path->skip_locking = 1; + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_put_root(root); + return ret; + } + if (ret > 0 && path->slots[level] > 0) + path->slots[level]--; + + eb = path->nodes[level]; + if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { + btrfs_err(fs_info, +"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", + cur->bytenr, level - 1, root->root_key.objectid, + tree_key->objectid, tree_key->type, tree_key->offset); + btrfs_put_root(root); + ret = -ENOENT; + goto out; + } + lower = cur; + + /* Add all nodes and edges in the path */ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) { + ASSERT(btrfs_root_bytenr(&root->root_item) == + lower->bytenr); + /* Same as previous should_ignore_reloc_root() call */ + if (btrfs_should_ignore_reloc_root(root) && + cache->is_reloc) { + btrfs_put_root(root); + list_add(&lower->list, &cache->useless_node); + } else { + lower->root = root; + } + break; + } + + edge = btrfs_backref_alloc_edge(cache); + if (!edge) { + btrfs_put_root(root); + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[level]; + rb_node = rb_simple_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = btrfs_backref_alloc_node(cache, eb->start, + lower->level + 1); + if (!upper) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + ret = -ENOMEM; + goto out; + } + upper->owner = btrfs_header_owner(eb); + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + upper->cowonly = 1; + + /* + * If we know the block isn't shared we can avoid + * checking its backrefs. + */ + if (btrfs_block_can_be_shared(trans, root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * Add the block to pending list if we need to check its + * backrefs, we only do this once while walking up a + * tree as we will catch anything else later on. + */ + if (!upper->checked && need_check) { + need_check = false; + list_add_tail(&edge->list[UPPER], + &cache->pending_edge); + } else { + if (upper->checked) + need_check = true; + INIT_LIST_HEAD(&edge->list[UPPER]); + } + } else { + upper = rb_entry(rb_node, struct btrfs_backref_node, + rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); + } + btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); + + if (rb_node) { + btrfs_put_root(root); + break; + } + lower = upper; + upper = NULL; + } +out: + btrfs_release_path(path); + return ret; +} + +/* + * Add backref node @cur into @cache. + * + * NOTE: Even if the function returned 0, @cur is not yet cached as its upper + * links aren't yet bi-directional. Needs to finish such links. + * Use btrfs_backref_finish_upper_links() to finish such linkage. + * + * @trans: Transaction handle. 
+ * @path: Released path for indirect tree backref lookup + * @iter: Released backref iter for extent tree search + * @node_key: The first key of the tree block + */ +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_backref_iter *iter, + struct btrfs_key *node_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *exist; + int ret; + + ret = btrfs_backref_iter_start(iter, cur->bytenr); + if (ret < 0) + return ret; + /* + * We skip the first btrfs_tree_block_info, as we don't use the key + * stored in it, but fetch it from the tree block + */ + if (btrfs_backref_has_tree_block_info(iter)) { + ret = btrfs_backref_iter_next(iter); + if (ret < 0) + goto out; + /* No extra backref? This means the tree block is corrupted */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * The backref was added previously when processing backref of + * type BTRFS_TREE_BLOCK_REF_KEY + */ + ASSERT(list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct btrfs_backref_edge, + list[LOWER]); + ASSERT(list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * Add the upper level block to pending list if we need check + * its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &cache->pending_edge); + } else { + exist = NULL; + } + + for (; ret == 0; ret = btrfs_backref_iter_next(iter)) { + struct extent_buffer *eb; + struct btrfs_key key; + int type; + + cond_resched(); + eb = btrfs_backref_get_eb(iter); + + key.objectid = iter->bytenr; + if (btrfs_backref_iter_is_inline_ref(iter)) { + struct btrfs_extent_inline_ref *iref; + + /* Update key for inline backref */ + iref = (struct btrfs_extent_inline_ref *) + ((unsigned long)iter->cur_ptr); + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) { + ret = -EUCLEAN; + goto out; + } + key.type = type; + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + } else { + key.type = iter->cur_key.type; + key.offset = iter->cur_key.offset; + } + + /* + * Parent node found and matches current inline ref, no need to + * rebuild this node for this inline ref + */ + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + continue; + } + + /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { + ret = handle_direct_tree_backref(cache, &key, cur); + if (ret < 0) + goto out; + } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) { + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref + * offset means the root objectid. We need to search + * the tree to get its parent bytenr. + */ + ret = handle_indirect_tree_backref(trans, cache, path, + &key, node_key, cur); + if (ret < 0) + goto out; + } + /* + * Unrecognized tree backref items (if it can pass tree-checker) + * would be ignored. 
+ */ + } + ret = 0; + cur->checked = 1; + WARN_ON(exist); +out: + btrfs_backref_iter_release(iter); + return ret; +} + +/* + * Finish the upwards linkage created by btrfs_backref_add_tree_node() + */ +int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *start) +{ + struct list_head *useless_node = &cache->useless_node; + struct btrfs_backref_edge *edge; + struct rb_node *rb_node; + LIST_HEAD(pending_edge); + + ASSERT(start->checked); + + /* Insert this node to cache if it's not COW-only */ + if (!start->cowonly) { + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, + &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, + -EEXIST); + list_add_tail(&start->lower, &cache->leaves); + } + + /* + * Use breadth first search to iterate all related edges. + * + * The starting points are all the edges of this node + */ + list_for_each_entry(edge, &start->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &pending_edge); + + while (!list_empty(&pending_edge)) { + struct btrfs_backref_node *upper; + struct btrfs_backref_node *lower; + + edge = list_first_entry(&pending_edge, + struct btrfs_backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + lower = edge->node[LOWER]; + + /* Parent is detached, no need to keep any edges */ + if (upper->detached) { + list_del(&edge->list[LOWER]); + btrfs_backref_free_edge(cache, edge); + + /* Lower node is orphan, queue for cleanup */ + if (list_empty(&lower->upper)) + list_add(&lower->list, useless_node); + continue; + } + + /* + * All new nodes added in current build_backref_tree() haven't + * been linked to the cache rb tree. + * So if we have upper->rb_node populated, this means a cache + * hit. We only need to link the edge, as @upper and all its + * parents have already been linked. 
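+ * (The lowest flag is also cleared below: a cached @upper that gains a child + * edge is no longer a leaf of the cache.)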
+ */ + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + /* Sanity check, we shouldn't have any unchecked nodes */ + if (!upper->checked) { + ASSERT(0); + return -EUCLEAN; + } + + /* Sanity check, COW-only node has non-COW-only parent */ + if (start->cowonly != upper->cowonly) { + ASSERT(0); + return -EUCLEAN; + } + + /* Only cache non-COW-only (subvolume trees) tree blocks */ + if (!upper->cowonly) { + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (rb_node) { + btrfs_backref_panic(cache->fs_info, + upper->bytenr, -EEXIST); + return -EUCLEAN; + } + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + + /* + * Also queue all the parent edges of this uncached node + * to finish the upper linkage + */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &pending_edge); + } + return 0; +} + +void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *lower; + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + + while (!list_empty(&cache->useless_node)) { + lower = list_first_entry(&cache->useless_node, + struct btrfs_backref_node, list); + list_del_init(&lower->list); + } + while (!list_empty(&cache->pending_edge)) { + edge = list_first_entry(&cache->pending_edge, + struct btrfs_backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + upper = edge->node[UPPER]; + btrfs_backref_free_edge(cache, edge); + + /* + * Lower is no longer linked to any upper backref nodes and + * isn't in the cache, we can free it ourselves. + */ + if (list_empty(&lower->upper) && + RB_EMPTY_NODE(&lower->rb_node)) + list_add(&lower->list, &cache->useless_node); + + if (!RB_EMPTY_NODE(&upper->rb_node)) + continue; + + /* Add this guy's upper edges to the list to process */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], + &cache->pending_edge); + if (list_empty(&upper->upper)) + list_add(&upper->list, &cache->useless_node); + } + + while (!list_empty(&cache->useless_node)) { + lower = list_first_entry(&cache->useless_node, + struct btrfs_backref_node, list); + list_del_init(&lower->list); + if (lower == node) + node = NULL; + btrfs_backref_drop_node(cache, lower); + } + + btrfs_backref_cleanup_node(cache, node); + ASSERT(list_empty(&cache->useless_node) && + list_empty(&cache->pending_edge)); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 0000000000..71d535e03d --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,556 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + */ + +#ifndef BTRFS_BACKREF_H +#define BTRFS_BACKREF_H + +#include +#include "messages.h" +#include "ulist.h" +#include "disk-io.h" +#include "extent_io.h" + +/* + * Used by implementations of iterate_extent_inodes_t (see definition below) to + * signal that backref iteration can stop immediately and no error happened. 
+ * The value must be non-negative and must not be 0, 1 (which is a common return + * value from things like btrfs_search_slot() and used internally in the backref + * walking code) and different from BACKREF_FOUND_SHARED and + * BACKREF_FOUND_NOT_SHARED + */ +#define BTRFS_ITERATE_EXTENT_INODES_STOP 5 + +/* + * Should return 0 if no errors happened and iteration of backrefs should + * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero + * value to immediately stop iteration and possibly signal an error back to + * the caller. + */ +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); + +/* + * Context and arguments for backref walking functions. Some of the fields are + * to be filled by the caller of such functions while other are filled by the + * functions themselves, as described below. + */ +struct btrfs_backref_walk_ctx { + /* + * The address of the extent for which we are doing backref walking. + * Can be either a data extent or a metadata extent. + * + * Must always be set by the top level caller. + */ + u64 bytenr; + /* + * Offset relative to the target extent. This is only used for data + * extents, and it's meaningful because we can have file extent items + * that point only to a section of a data extent ("bookend" extents), + * and we want to filter out any that don't point to a section of the + * data extent containing the given offset. + * + * Must always be set by the top level caller. + */ + u64 extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then references from + * all file extent items that point to the data extent are considered, + * @extent_item_pos is ignored. + */ + bool ignore_extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then the inode list + * (each member describing inode number, file offset and root) is not + * added to each reference added to the @refs ulist. + */ + bool skip_inode_ref_list; + /* A valid transaction handle or NULL. */ + struct btrfs_trans_handle *trans; + /* + * The file system's info object, can not be NULL. + * + * Must always be set by the top level caller. + */ + struct btrfs_fs_info *fs_info; + /* + * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the + * caller joined the tree mod log to get a consistent view of b+trees + * while we do backref walking, or BTRFS_SEQ_LAST. + * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses + * commit roots when searching b+trees - this is a special case for + * qgroups used during a transaction commit. + */ + u64 time_seq; + /* + * Used to collect the bytenr of metadata extents that point to the + * target extent. + */ + struct ulist *refs; + /* + * List used to collect the IDs of the roots from which the target + * extent is accessible. Can be NULL in case the caller does not care + * about collecting root IDs. + */ + struct ulist *roots; + /* + * Used by iterate_extent_inodes() and the main backref walk code + * (find_parent_nodes()). Lookup and store functions for an optional + * cache which maps the logical address (bytenr) of leaves to an array + * of root IDs. 
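+ * cache_lookup() returns true on a hit and fills @root_ids_ret and + * @root_count_ret, while cache_store() records the root IDs resolved for + * @leaf_bytenr so later walks can skip the expensive tree searches. Both are + * optional and may be NULL.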
+ */ + bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, + const u64 **root_ids_ret, int *root_count_ret); + void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, + void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this + * for each indirect data extent reference as soon as it finds one, + * before collecting all the remaining backrefs and before resolving + * indirect backrefs. This allows for the caller to terminate backref + * walking as soon as it finds one backref that matches some specific + * criteria. The @cache_lookup and @cache_store callbacks should not + * be NULL in order to use this callback. + */ + iterate_extent_inodes_t *indirect_ref_iterator; + /* + * If this is not NULL, then the backref walking code will call this for + * each extent item it's meant to process before it actually starts + * processing it. If this returns anything other than 0, then it stops + * the backref walking code immediately. + */ + int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this for + * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before + * processing that data ref. If this callback return false, then it will + * ignore this data ref and it will never resolve the indirect data ref, + * saving time searching for leaves in a fs tree with file extent items + * matching the data ref. + */ + bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx); + /* Context object to pass to the callbacks defined above. */ + void *user_ctx; +}; + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +struct btrfs_backref_shared_cache_entry { + u64 bytenr; + u64 gen; + bool is_shared; +}; + +#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8 + +struct btrfs_backref_share_check_ctx { + /* Ulists used during backref walking. */ + struct ulist refs; + /* + * The current leaf the caller of btrfs_is_data_extent_shared() is at. + * Typically the caller (at the moment only fiemap) tries to determine + * the sharedness of data extents point by file extent items from entire + * leaves. + */ + u64 curr_leaf_bytenr; + /* + * The previous leaf the caller was at in the previous call to + * btrfs_is_data_extent_shared(). This may be the same as the current + * leaf. On the first call it must be 0. + */ + u64 prev_leaf_bytenr; + /* + * A path from a root to a leaf that has a file extent item pointing to + * a given data extent should never exceed the maximum b+tree height. + */ + struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL]; + bool use_path_cache; + /* + * Cache the sharedness result for the last few extents we have found, + * but only for extents for which we have multiple file extent items + * that point to them. + * It's very common to have several file extent items that point to the + * same extent (bytenr) but with different offsets and lengths. This + * typically happens for COW writes, partial writes into prealloc + * extents, NOCOW writes after snapshoting a root, hole punching or + * reflinking within the same file (less common perhaps). + * So keep a small cache with the lookup results for the extent pointed + * by the last few file extent items. 
This cache is checked, with a + * linear scan, whenever btrfs_is_data_extent_shared() is called, so + * it must be small so that it does not negatively affect performance in + * case we don't have multiple file extent items that point to the same + * data extent. + */ + struct { + u64 bytenr; + bool is_shared; + } prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE]; + /* + * The slot in the prev_extents_cache array that will be used for + * storing the sharedness result of a new data extent. + */ + int prev_extents_cache_slot; +}; + +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, void *ctx, + bool ignore_offset); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, + bool skip_commit_root_sem); +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off); +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + u64 extent_gen, + struct btrfs_backref_share_check_ctx *ctx); + +int __init btrfs_prelim_ref_init(void); +void __cold btrfs_prelim_ref_exit(void); + +struct prelim_ref { + struct rb_node rbnode; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 wanted_disk_byte; +}; + +/* + * Iterate backrefs of one extent. + * + * Now it only supports iteration of tree block in commit root. + */ +struct btrfs_backref_iter { + u64 bytenr; + struct btrfs_path *path; + struct btrfs_fs_info *fs_info; + struct btrfs_key cur_key; + u32 item_ptr; + u32 cur_ptr; + u32 end_ptr; +}; + +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); + +static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) +{ + if (!iter) + return; + btrfs_free_path(iter->path); + kfree(iter); +} + +static inline struct extent_buffer *btrfs_backref_get_eb( + struct btrfs_backref_iter *iter) +{ + if (!iter) + return NULL; + return iter->path->nodes[0]; +} + +/* + * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data + * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. + * + * This helper determines if that's the case. 
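+ * It checks that the current key is an EXTENT_ITEM and that the iterator + * still points right past the btrfs_extent_item (cur_ptr - item_ptr equals + * sizeof(struct btrfs_extent_item)), i.e. no inline ref has been read yet.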
+ */ +static inline bool btrfs_backref_has_tree_block_info( + struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY && + iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item)) + return true; + return false; +} + +int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); + +int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); + +static inline bool btrfs_backref_iter_is_inline_ref( + struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || + iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) + return true; + return false; +} + +static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) +{ + iter->bytenr = 0; + iter->item_ptr = 0; + iter->cur_ptr = 0; + iter->end_ptr = 0; + btrfs_release_path(iter->path); + memset(&iter->cur_key, 0, sizeof(iter->cur_key)); +} + +/* + * Backref cache related structures + * + * The whole objective of backref_cache is to build a bi-directional map + * of tree blocks (represented by backref_node) and all their parents. + */ + +/* + * Represent a tree block in the backref cache + */ +struct btrfs_backref_node { + struct { + struct rb_node rb_node; + u64 bytenr; + }; /* Use rb_simple_node for search/insert */ + + u64 new_bytenr; + /* Objectid of tree block owner, can be not uptodate */ + u64 owner; + /* Link to pending, changed or detached list */ + struct list_head list; + + /* List of upper level edges, which link this node to its parents */ + struct list_head upper; + /* List of lower level edges, which link this node to its children */ + struct list_head lower; + + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* Extent buffer got by COWing the block */ + struct extent_buffer *eb; + /* Level of the tree block */ + unsigned int level:8; + /* Is the block in a non-shareable tree */ + unsigned int cowonly:1; + /* 1 if no child node is in the cache */ + unsigned int lowest:1; + /* Is the extent buffer locked */ + unsigned int locked:1; + /* Has the block been processed */ + unsigned int processed:1; + /* Have backrefs of this block been checked */ + unsigned int checked:1; + /* + * 1 if corresponding block has been COWed but some upper level block + * pointers may not point to the new location + */ + unsigned int pending:1; + /* 1 if the backref node isn't connected to any other backref node */ + unsigned int detached:1; + + /* + * For generic purpose backref cache, where we only care if it's a reloc + * root, doesn't care the source subvolid. + */ + unsigned int is_reloc_root:1; +}; + +#define LOWER 0 +#define UPPER 1 + +/* + * Represent an edge connecting upper and lower backref nodes. + */ +struct btrfs_backref_edge { + /* + * list[LOWER] is linked to btrfs_backref_node::upper of lower level + * node, and list[UPPER] is linked to btrfs_backref_node::lower of + * upper level node. + * + * Also, build_backref_tree() uses list[UPPER] for pending edges, before + * linking list[UPPER] to its upper level nodes. 
+ */ + struct list_head list[2]; + + /* Two related nodes */ + struct btrfs_backref_node *node[2]; +}; + +struct btrfs_backref_cache { + /* Red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* For passing backref nodes to btrfs_reloc_cow_block */ + struct btrfs_backref_node *path[BTRFS_MAX_LEVEL]; + /* + * List of blocks that have been COWed but some block pointers in upper + * level blocks may not reflect the new location + */ + struct list_head pending[BTRFS_MAX_LEVEL]; + /* List of backref nodes with no child node */ + struct list_head leaves; + /* List of blocks that have been COWed in current transaction */ + struct list_head changed; + /* List of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; + + /* List of unchecked backref edges during backref cache build */ + struct list_head pending_edge; + + /* List of useless backref nodes during backref cache build */ + struct list_head useless_node; + + struct btrfs_fs_info *fs_info; + + /* + * Whether this cache is for relocation + * + * Reloction backref cache require more info for reloc root compared + * to generic backref cache. + */ + unsigned int is_reloc; +}; + +void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, + struct btrfs_backref_cache *cache, int is_reloc); +struct btrfs_backref_node *btrfs_backref_alloc_node( + struct btrfs_backref_cache *cache, u64 bytenr, int level); +struct btrfs_backref_edge *btrfs_backref_alloc_edge( + struct btrfs_backref_cache *cache); + +#define LINK_LOWER (1 << 0) +#define LINK_UPPER (1 << 1) +static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which) +{ + ASSERT(upper && lower && upper->level == lower->level + 1); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + if (link_which & LINK_LOWER) + list_add_tail(&edge->list[LOWER], &lower->upper); + if (link_which & LINK_UPPER) + list_add_tail(&edge->list[UPPER], &upper->lower); +} + +static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); + cache->nr_nodes--; + btrfs_put_root(node->root); + kfree(node); + } +} + +static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +static inline void btrfs_backref_unlock_node_buffer( + struct btrfs_backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +static inline void btrfs_backref_drop_node_buffer( + struct btrfs_backref_node *node) +{ + if (node->eb) { + btrfs_backref_unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +/* + * Drop the backref node from cache without cleaning up its children + * edges. + * + * This can only be called on node without parent edges. + * The children edges are still kept as is. 
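+ * The ASSERT() on node->upper below enforces the no-parent-edges requirement, + * and the node is unlinked from the cache rb tree if it had been inserted.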
+ */ +static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node) +{ + ASSERT(list_empty(&node->upper)); + + btrfs_backref_drop_node_buffer(node); + list_del_init(&node->list); + list_del_init(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + btrfs_backref_free_node(tree, node); +} + +void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); + +void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); + +static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, + u64 bytenr, int errno) +{ + btrfs_panic(fs_info, errno, + "Inconsistency in backref cache found at offset %llu", + bytenr); +} + +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_backref_iter *iter, + struct btrfs_key *node_key, + struct btrfs_backref_node *cur); + +int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *start); + +void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); + +#endif diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c new file mode 100644 index 0000000000..12b12443ef --- /dev/null +++ b/fs/btrfs/bio.c @@ -0,0 +1,872 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#include +#include "bio.h" +#include "ctree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "rcu-string.h" +#include "zoned.h" +#include "file-item.h" + +static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; + +/* Is this a data path I/O that needs storage layer checksum and repair? */ +static inline bool is_data_bbio(struct btrfs_bio *bbio) +{ + return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); +} + +static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +{ + return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; +} + +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private) +{ + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->fs_info = fs_info; + bbio->end_io = end_io; + bbio->private = private; + atomic_set(&bbio->pending_ios, 1); +} + +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. 
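+ * The bio is allocated from btrfs_bioset, which reserves room for the + * containing btrfs_bio in front of the embedded bio (see btrfs_bioset_init()), + * so btrfs_bio() can recover the container with container_of().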
+ */ +struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private) +{ + struct btrfs_bio *bbio; + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, fs_info, end_io, private); + return bbio; +} + +static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct btrfs_bio *orig_bbio, + u64 map_length, bool use_append) +{ + struct btrfs_bio *bbio; + struct bio *bio; + + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, + GFP_NOFS, &btrfs_clone_bioset); + } + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); + bbio->inode = orig_bbio->inode; + bbio->file_offset = orig_bbio->file_offset; + orig_bbio->file_offset += map_length; + if (bbio_has_ordered_extent(bbio)) { + refcount_inc(&orig_bbio->ordered->refs); + bbio->ordered = orig_bbio->ordered; + } + atomic_inc(&orig_bbio->pending_ios); + return bbio; +} + +/* Free a bio that was never submitted to the underlying device. */ +static void btrfs_cleanup_bio(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); +} + +static void __btrfs_bio_end_io(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } +} + +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + __btrfs_bio_end_io(bbio); +} + +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. 
+ */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + btrfs_cleanup_bio(bbio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + __btrfs_bio_end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + btrfs_submit_bio(repair_bbio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. 
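+ * + * For example, with three copies and a checksum failure on mirror 1, repair + * reads try mirror 2 and then mirror 3 (next_repair_mirror() wraps around); + * once a good copy is found, btrfs_end_repair_bio() walks back with + * prev_repair_mirror() and rewrites every mirror that had failed.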
+ */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); + repair_bbio->inode = failed_bbio->inode; + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bbio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* Read-repair requires the inode field to be set by the submitter. */ + ASSERT(inode); + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. 
*/ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + else if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + /* Metadata reads are checked and repaired by the submitter. */ + if (is_data_bbio(bbio)) + btrfs_check_read_bio(bbio, bbio->bio.bi_private); + else + btrfs_orig_bbio_end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->fs_info; + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, dev); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + + if (bio->bi_status) { + atomic_inc(&bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. 
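+ * A RAID1 write, for example, tolerates one failed mirror (max_errors == 1), + * so a single write error is still reported as success and only losing both + * copies fails the bio.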
+ */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; + + btrfs_orig_bbio_end_io(bbio); + btrfs_put_bioc(bioc); +} + +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); +} + +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) +{ + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + bio_io_error(bio); + return; + } + + bio_set_dev(bio, dev->bdev); + + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); + + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); + + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) + blkcg_punt_bio_submit(bio); + else + submit_bio(bio); +} + +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +{ + struct bio *orig_bio = bioc->orig_bio, *bio; + + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; + } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); +} + +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + if (!bioc) { + /* Single mirror read/write fast path. */ + btrfs_bio(bio)->mirror_num = mirror_num; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; + bio->bi_private = smap->dev; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap->dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity RAID write or read recovery. */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors. 
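+ * One bio per stripe: the bio embedded in the btrfs_bio is reused for the + * last mirror and the other mirrors get clones (see btrfs_submit_mirrored_bio()).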
*/ + int total_devs = bioc->num_stripes; + + bioc->orig_bio = bio; + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } +} + +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's + * context. This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* Submit synchronously if the checksum implementation is fast. */ + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + return false; + + /* + * Try to defer the submission to a workqueue to parallelize the + * checksum calculation unless the I/O is issued synchronously. + */ + if (op_is_sync(bbio->bio.bi_opf)) + return false; + + /* Zoned devices require I/O to be submitted in order. */ + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + return false; + + return true; +} + +/* + * Submit bio to an async queue. + * + * Return true if the work has been succesfuly submitted, else false. 
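+ * A false return (the async_submit_bio allocation failed) means the caller + * has to fall back to checksumming and submitting the bio synchronously.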
+ */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_bio *orig_bbio = bbio; + struct bio *bio = &bbio->bio; + u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + bool use_append = btrfs_use_zone_append(bbio); + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + blk_status_t ret; + int error; + + btrfs_bio_counter_inc_blocked(fs_info); + error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (error) { + ret = errno_to_blk_status(error); + goto fail; + } + + map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { + bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bio = &bbio->bio; + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. + */ + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) + goto fail_put_bio; + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + } + + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. + */ + if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + goto done; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail_put_bio; + } else if (use_append) { + ret = btrfs_alloc_dummy_sum(bbio); + if (ret) + goto fail_put_bio; + } + } + + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); +done: + return map_length == length; + +fail_put_bio: + if (map_length < length) + btrfs_cleanup_bio(bbio); +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(orig_bbio, ret); + /* Do not submit another chunk */ + return true; +} + +void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +{ + /* If bbio->inode is not populated, its file_offset must be 0. */ + ASSERT(bbio->inode || bbio->file_offset == 0); + + while (!btrfs_submit_chunk(bbio, mirror_num)) + ; +} + +/* + * Submit a repair write. + * + * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * + * The I/O is issued synchronously to block the repair read completion from + * freeing the bio. 
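+ * The bio and its single bio_vec live on the stack below, which is why the + * write is issued with submit_bio_wait().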
+ */ +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) +{ + struct btrfs_io_stripe smap = { 0 }; + struct bio_vec bvec; + struct bio bio; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); + BUG_ON(!mirror_num); + + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); + if (ret < 0) + goto out_counter_dec; + + if (!smap.dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + + bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere? */ + btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", + ino, start, btrfs_dev_name(smap.dev), + smap.physical >> SECTOR_SHIFT); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: + btrfs_bio_counter_dec(fs_info); + return ret; +} + +/* + * Submit a btrfs_bio based repair write. + * + * If @dev_replace is true, the write would be submitted to dev-replace target. + */ +void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) +{ + struct btrfs_fs_info *fs_info = bbio->fs_info; + u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 length = bbio->bio.bi_iter.bi_size; + struct btrfs_io_stripe smap = { 0 }; + int ret; + + ASSERT(fs_info); + ASSERT(mirror_num > 0); + ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); + ASSERT(!bbio->inode); + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); + if (ret < 0) + goto fail; + + if (dev_replace) { + ASSERT(smap.dev == fs_info->dev_replace.srcdev); + smap.dev = fs_info->dev_replace.tgtdev; + } + __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + return; + +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); +} + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), 0)) + goto out_free_bioset; + if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_clone_bioset; + if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, + sizeof(struct btrfs_failed_bio))) + goto out_free_repair_bioset; + return 0; + +out_free_repair_bioset: + bioset_exit(&btrfs_repair_bioset); +out_free_clone_bioset: + bioset_exit(&btrfs_clone_bioset); +out_free_bioset: + bioset_exit(&btrfs_bioset); + return -ENOMEM; +} + +void __cold btrfs_bioset_exit(void) +{ + mempool_exit(&btrfs_failed_bio_pool); + bioset_exit(&btrfs_repair_bioset); + bioset_exit(&btrfs_clone_bioset); + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h new 
file mode 100644 index 0000000000..ca79decee0 --- /dev/null +++ b/fs/btrfs/bio.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#ifndef BTRFS_BIO_H +#define BTRFS_BIO_H + +#include +#include +#include "tree-checker.h" + +struct btrfs_bio; +struct btrfs_fs_info; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* + * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and + * passed to btrfs_submit_bio for mapping to the physical devices. + */ +struct btrfs_bio { + /* + * Inode and offset into it that this I/O operates on. + * Only set for data I/O. + */ + struct btrfs_inode *inode; + u64 file_offset; + + union { + /* + * For data reads: checksumming and original I/O information. + * (for internal use in the btrfs_submit_bio machinery only) + */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + struct bvec_iter saved_iter; + }; + + /* + * For data writes: + * - ordered extent covering the bio + * - pointer to the checksums for this bio + * - original physical address from the allocator + * (for zone append only) + */ + struct { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_sum *sums; + u64 orig_physical; + }; + + /* For metadata reads: parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For internal use in read end I/O handling */ + unsigned int mirror_num; + atomic_t pending_ios; + struct work_struct end_io_work; + + /* File system that this I/O operates on. */ + struct btrfs_fs_info *fs_info; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_bio, bio); +} + +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private); +struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private); +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); + +/* Submit using blkcg_punt_bio_submit. 
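+ * Bios flagged with it are handed to blkcg_punt_bio_submit() in + * btrfs_submit_dev_bio() so they are issued from the owning cgroup's context.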
*/ +#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE + +void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); + +#endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c new file mode 100644 index 0000000000..5a97db9888 --- /dev/null +++ b/fs/btrfs/block-group.c @@ -0,0 +1,4545 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "misc.h" +#include "ctree.h" +#include "block-group.h" +#include "space-info.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "free-space-tree.h" +#include "volumes.h" +#include "transaction.h" +#include "ref-verify.h" +#include "sysfs.h" +#include "tree-log.h" +#include "delalloc-space.h" +#include "discard.h" +#include "raid56.h" +#include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" + +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + +/* + * Return target flags in extended format or 0 if restripe for this chunk_type + * is not in progress + * + * Should be called with balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Return reduced profile in chunk format. If profile changing is in progress + * (either running or paused) picks the target profile (if it's already + * available), otherwise falls back to plain reducing. + */ +static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 num_devices = fs_info->fs_devices->rw_devices; + u64 target; + u64 raid_type; + u64 allowed = 0; + + /* + * See if restripe for this chunk_type is in progress, if so try to + * reduce to the target profile + */ + spin_lock(&fs_info->balance_lock); + target = get_restripe_target(fs_info, flags); + if (target) { + spin_unlock(&fs_info->balance_lock); + return extended_to_chunk(target); + } + spin_unlock(&fs_info->balance_lock); + + /* First, mask out the RAID levels which aren't possible */ + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_array[raid_type].bg_flag; + } + allowed &= flags; + + /* Select the highest-redundancy RAID level. 
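+ * The ladder below is ordered from most to least redundant, so e.g. if both + * RAID1 and RAID0 survive the device-count mask above, RAID1 is chosen.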
*/ + if (allowed & BTRFS_BLOCK_GROUP_RAID1C4) + allowed = BTRFS_BLOCK_GROUP_RAID1C4; + else if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3) + allowed = BTRFS_BLOCK_GROUP_RAID1C3; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_DUP) + allowed = BTRFS_BLOCK_GROUP_DUP; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); +} + +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +{ + unsigned seq; + u64 flags; + + do { + flags = orig_flags; + seq = read_seqbegin(&fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + return btrfs_reduce_alloc_profile(fs_info, flags); +} + +void btrfs_get_block_group(struct btrfs_block_group *cache) +{ + refcount_inc(&cache->refs); +} + +void btrfs_put_block_group(struct btrfs_block_group *cache) +{ + if (refcount_dec_and_test(&cache->refs)) { + WARN_ON(cache->pinned > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); + + /* + * A block_group shouldn't be on the discard_list anymore. + * Remove the block_group from the discard_list to prevent us + * from causing a panic due to NULL pointer dereference. 
+ */ + if (WARN_ON(!list_empty(&cache->discard_list))) + btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, + cache); + + kfree(cache->free_space_ctl); + kfree(cache->physical_map); + kfree(cache); + } +} + +/* + * This adds the block group to the fs_info rb tree for the block group cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group *cache; + bool leftmost = true; + + ASSERT(block_group->length != 0); + + write_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_root.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group, cache_node); + if (block_group->start < cache->start) { + p = &(*p)->rb_left; + } else if (block_group->start > cache->start) { + p = &(*p)->rb_right; + leftmost = false; + } else { + write_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color_cached(&block_group->cache_node, + &info->block_group_cache_tree, leftmost); + + write_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group *block_group_cache_tree_search( + struct btrfs_fs_info *info, u64 bytenr, int contains) +{ + struct btrfs_block_group *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + read_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_root.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group, cache_node); + end = cache->start + cache->length - 1; + start = cache->start; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->start)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + btrfs_get_block_group(ret); + read_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * Return the block group that starts at or after bytenr + */ +struct btrfs_block_group *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 0); +} + +/* + * Return the block group that contains the given bytenr + */ +struct btrfs_block_group *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 1); +} + +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct rb_node *node; + + read_lock(&fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->start + cache->length; + + read_unlock(&fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + return btrfs_lookup_first_block_group(fs_info, next_bytenr); + } + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group, cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + read_unlock(&fs_info->block_group_cache_lock); + return cache; +} + +/* + * Check if we can do a NOCOW write for a given extent. 
+ * + * @fs_info: The filesystem information object. + * @bytenr: Logical start address of the extent. + * + * Check if we can do a NOCOW write for the given extent, and increment the + * number of NOCOW writers in the block group that contains the extent, as long + * as the block group exists and it's currently not in read-only mode. + * + * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller + * is responsible for calling btrfs_dec_nocow_writers() later. + * + * Or NULL if we cannot do a NOCOW write + */ +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr) +{ + struct btrfs_block_group *bg; + bool can_nocow = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return NULL; + + spin_lock(&bg->lock); + if (bg->ro) + can_nocow = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + if (!can_nocow) { + btrfs_put_block_group(bg); + return NULL; + } + + /* No put on block group, done by btrfs_dec_nocow_writers(). */ + return bg; +} + +/* + * Decrement the number of NOCOW writers in a block group. + * + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), + * and on the block group returned by that call. Typically this is called after + * creating an ordered extent for a NOCOW write, to prevent races with scrub and + * relocation. + * + * After this call, the caller should not use the block group anymore. If it wants + * to use it, then it should get a reference on it before calling this function. + */ +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) +{ + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_var(&bg->nocow_writers); + + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ + btrfs_put_block_group(bg); +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) +{ + wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_var(&bg->reservations); + btrfs_put_block_group(bg); +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have already allocated an extent from it, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent.
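+ * + * Illustrative ordering of events (why the empty lock/unlock pair below is + * enough): + * 1) allocator task: down_read(&space_info->groups_sem) and increments + * bg->reservations + * 2) this task: down_write(&space_info->groups_sem) blocks until 3) + * 3) allocator task: up_read(&space_info->groups_sem) + * 4) this task: the down_write()/up_write() pair below completes + * 5) allocator task: creates the ordered extent and decrements + * bg->reservations + * 6) this task: the wait_var_event() below sees reservations == 0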
+ */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); +} + +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + refcount_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +{ + if (refcount_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * When we wait for progress in the block group caching, its because our + * allocation attempt failed at least once. So, we must sleep and let some + * progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to show + * up, and then it will check the block group free space numbers for our min + * num_bytes. Another option is to have it go ahead and look in the rbtree for + * a free extent of a given size, but this is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. + */ +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + int progress; + + caching_ctl = btrfs_get_caching_control(cache); + if (!caching_ctl) + return; + + /* + * We've already failed to allocate from this block group, so even if + * there's enough space in the block group it isn't contiguous enough to + * allow for an allocation, so wait for at least the next wakeup tick, + * or for the thing to be done. + */ + progress = atomic_read(&caching_ctl->progress); + + wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || + (progress != atomic_read(&caching_ctl->progress) && + (cache->free_space_ctl->free_space >= num_bytes))); + + btrfs_put_caching_control(caching_ctl); +} + +static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, + struct btrfs_caching_control *caching_ctl) +{ + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); + return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; +} + +static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) +{ + struct btrfs_caching_control *caching_ctl; + int ret; + + caching_ctl = btrfs_get_caching_control(cache); + if (!caching_ctl) + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); + btrfs_put_caching_control(caching_ctl); + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 start = block_group->start; + u64 len = block_group->length; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + fs_info->nodesize : fs_info->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + +/* + * Add a free space range to the in memory free space cache of a block group. + * This checks if the range contains super block locations and any such + * locations are not added to the free space cache. + * + * @block_group: The target block group. + * @start: Start offset of the range. + * @end: End offset of the range (exclusive). 
+ * @total_added_ret: Optional pointer to return the total amount of space + * added to the block group's free space cache. + * + * Returns 0 on success or < 0 on error. + */ +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, + u64 end, u64 *total_added_ret) +{ + struct btrfs_fs_info *info = block_group->fs_info; + u64 extent_start, extent_end, size; + int ret; + + if (total_added_ret) + *total_added_ret = 0; + + while (start < end) { + if (!find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL)) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + ret = btrfs_add_free_space_async_trimmed(block_group, + start, size); + if (ret) + return ret; + if (total_added_ret) + *total_added_ret += size; + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + ret = btrfs_add_free_space_async_trimmed(block_group, start, + size); + if (ret) + return ret; + if (total_added_ret) + *total_added_ret += size; + } + + return 0; +} + +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. + */ +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *found_key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + u64 search_offset; + u64 search_end = block_group->start + block_group->length; + struct btrfs_path *path; + struct btrfs_key search_key; + int ret = 0; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + search_key.objectid = block_group->start + search_offset; + search_key.type = BTRFS_EXTENT_ITEM_KEY; + search_key.offset = 0; + + btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) { + /* Success; sampled an extent item in the block group */ + if (found_key->type == BTRFS_EXTENT_ITEM_KEY && + found_key->objectid >= block_group->start && + found_key->objectid + found_key->offset <= search_end) + break; + + /* We can't possibly find a valid extent item anymore */ + if (found_key->objectid >= search_end) { + ret = 1; + break; + } + } + + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + btrfs_free_path(path); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. 
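+ * + * For example, with the 5 samples taken below on a 1GiB block group, + * sample_block_group_extent_item() starts its searches at roughly 0%, 20%, + * 40%, 60% and 80% of the block group (search_offset is + * index * (length / max_index)), and the smallest extent size seen across + * the samples decides the size class.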
+ * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. + * + * Returns: 0 on success, negative error code on error. + */ +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } +out: + return ret; +} + +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret; + bool wakeup = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); + extent_root = btrfs_extent_root(fs_info, last); + +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. 
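+ * (For reference, fragment_free_space() above removes every other chunk of + * nodesize (metadata) or sectorsize (data) from the block group's free + * space, leaving a checkerboard pattern for testing.)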
+ */ + if (btrfs_should_fragment_free_space(block_group)) + wakeup = false; +#endif + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + +next: + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (btrfs_fs_closing(fs_info) > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = btrfs_find_next_key(extent_root, path, &key, 0, 0); + if (ret) + break; + + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; + } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + btrfs_release_path(path); + goto next; + } + + if (key.objectid < block_group->start) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->start + block_group->length) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + u64 space_added; + + ret = btrfs_add_new_free_space(block_group, last, + key.objectid, &space_added); + if (ret) + goto out; + total_found += space_added; + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->nodesize; + else + last = key.objectid + key.offset; + + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + if (wakeup) { + atomic_inc(&caching_ctl->progress); + wake_up(&caching_ctl->wait); + } + } + } + path->slots[0]++; + } + + ret = btrfs_add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); +out: + btrfs_free_path(path); + return ret; +} + +static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) +{ + clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_UPTODATE); +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + load_block_group_size_class(caching_ctl, block_group); + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { + ret = load_free_space_cache(block_group); + if (ret == 1) { + ret = 0; + goto done; + } + + /* + * We failed to load the space cache, set ourselves to + * CACHE_STARTED and carry on. 
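+ * (The caching state then progresses from BTRFS_CACHE_NO through + * BTRFS_CACHE_STARTED to BTRFS_CACHE_FINISHED, or to BTRFS_CACHE_ERROR if + * loading the free space fails, and waiters on caching_ctl->wait are woken + * along the way.)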
+ */ + spin_lock(&block_group->lock); + block_group->cached = BTRFS_CACHE_STARTED; + spin_unlock(&block_group->lock); + wake_up(&caching_ctl->wait); + } + + /* + * If we are in the transaction that populated the free space tree we + * can't actually cache from the free space tree as our commit root and + * real root are the same, so we could change the contents of the blocks + * while caching. Instead do the slow caching in this case, and after + * the transaction has committed we will be safe. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); +done: + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->length - block_group->used; + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(block_group); + } +#endif + + up_read(&fs_info->commit_root_sem); + btrfs_free_excluded_extents(block_group); + mutex_unlock(&caching_ctl->mutex); + + wake_up(&caching_ctl->wait); + + btrfs_put_caching_control(caching_ctl); + btrfs_put_block_group(block_group); +} + +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl = NULL; + int ret = 0; + + /* Allocator for zoned filesystems does not use the cache at all */ + if (btrfs_is_zoned(fs_info)) + return 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + if (!caching_ctl) + return -ENOMEM; + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + refcount_set(&caching_ctl->count, 2); + atomic_set(&caching_ctl->progress, 0); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + kfree(caching_ctl); + + caching_ctl = cache->caching_ctl; + if (caching_ctl) + refcount_inc(&caching_ctl->count); + spin_unlock(&cache->lock); + goto out; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + write_lock(&fs_info->block_group_cache_lock); + refcount_inc(&caching_ctl->count); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + write_unlock(&fs_info->block_group_cache_lock); + + btrfs_get_block_group(cache); + + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); +out: + if (wait && caching_ctl) + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); + if (caching_ctl) + btrfs_put_caching_control(caching_ctl); + + return ret; +} + +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + 
fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +/* + * Clear incompat bits for the following feature(s): + * + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group + * in the whole filesystem + * + * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups + */ +static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + bool found_raid56 = false; + bool found_raid1c34 = false; + + if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) || + (flags & BTRFS_BLOCK_GROUP_RAID1C3) || + (flags & BTRFS_BLOCK_GROUP_RAID1C4)) { + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *sinfo; + + list_for_each_entry_rcu(sinfo, head, list) { + down_read(&sinfo->groups_sem); + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) + found_raid56 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) + found_raid56 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3])) + found_raid1c34 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4])) + found_raid1c34 = true; + up_read(&sinfo->groups_sem); + } + if (!found_raid56) + btrfs_clear_fs_incompat(fs_info, RAID56); + if (!found_raid1c34) + btrfs_clear_fs_incompat(fs_info, RAID1C34); + } +} + +static int remove_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int ret; + + root = btrfs_block_group_root(fs_info); + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + return ret; + + ret = btrfs_del_item(trans, root, path); + return ret; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *path; + struct btrfs_block_group *block_group; + struct btrfs_free_cluster *cluster; + struct inode *inode; + struct kobject *kobj = NULL; + int ret; + int index; + int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; + bool remove_rsv = false; + + block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + trace_btrfs_remove_block_group(block_group); + /* + * Free the reserved super bytes from this block group before + * remove it. 
+ */ + btrfs_free_excluded_extents(block_group); + btrfs_free_ref_tree_range(fs_info, block_group->start, + block_group->length); + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + factor = btrfs_bg_type_to_factor(block_group->flags); + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ + inode = lookup_free_space_inode(block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * Make sure our free space cache IO is done before removing the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(trans, block_group, path); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + remove_rsv = true; + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + + ret = btrfs_remove_free_space_inode(trans, inode, block_group); + if (ret) + goto out; + + write_lock(&fs_info->block_group_cache_lock); + rb_erase_cached(&block_group->cache_node, + &fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + + /* Once for the block groups rbtree */ + btrfs_put_block_group(block_group); + + write_unlock(&fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; + clear_avail_alloc_bits(fs_info, block_group->flags); + } + up_write(&block_group->space_info->groups_sem); + clear_incompat_bg_bits(fs_info, block_group->flags); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + + if (block_group->cached == BTRFS_CACHE_STARTED) + btrfs_wait_block_group_cache_done(block_group); + + write_lock(&fs_info->block_group_cache_lock); + caching_ctl = btrfs_get_caching_control(block_group); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, &fs_info->caching_block_groups, list) { + if (ctl->block_group == block_group) { + caching_ctl = ctl; + refcount_inc(&caching_ctl->count); + break; + } + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + write_unlock(&fs_info->block_group_cache_lock); + + if (caching_ctl) { + /* Once for the 
caching bgs list and once for us. */ + btrfs_put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + WARN_ON(!list_empty(&block_group->dirty_list)); + WARN_ON(!list_empty(&block_group->io_list)); + spin_unlock(&trans->transaction->dirty_bgs_lock); + + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->length); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->length - block_group->zone_unusable); + WARN_ON(block_group->space_info->bytes_zone_unusable + < block_group->zone_unusable); + WARN_ON(block_group->space_info->disk_total + < block_group->length * factor); + } + block_group->space_info->total_bytes -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= + block_group->zone_unusable; + block_group->space_info->disk_total -= block_group->length * factor; + + spin_unlock(&block_group->space_info->lock); + + /* + * Remove the free space for the block group from the free space tree + * and the block group's item from the extent tree before marking the + * block group as removed. This is to prevent races with tasks that + * freeze and unfreeze a block group, this task and another task + * allocating a new block group - the unfreeze task ends up removing + * the block group's extent map before the task calling this function + * deletes the block group item from the extent tree, allowing for + * another task to attempt to create another block group with the same + * item key (and failing with -EEXIST and a transaction abort). + */ + ret = remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + + ret = remove_block_group_item(trans, path, block_group); + if (ret < 0) + goto out; + + spin_lock(&block_group->lock); + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); + + /* + * At this point trimming or scrub can't start on this block group, + * because we removed the block group from the rbtree + * fs_info->block_group_cache_tree so no one can find it anymore and + * even if someone already got this block group before we removed it + * from the rbtree, they have already incremented block_group->frozen - + * if they didn't, for the trimming case they won't find any free space + * entries because we already removed them all when we called + * btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is needed to + * avoid races with trimming and scrub. + * + * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed.
+ */ + remove_em = (atomic_read(&block_group->frozen) == 0); + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &fs_info->mapping_tree; + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + +out: + /* Once for the lookup reference */ + btrfs_put_block_group(block_group); + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_free_path(path); + return ret; +} + +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, const u64 chunk_offset) +{ + struct btrfs_root *root = btrfs_block_group_root(fs_info); + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + unsigned int num_items; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + ASSERT(em && em->start == chunk_offset); + + /* + * We need to reserve 3 + N units from the metadata space info in order + * to remove a block group (done at btrfs_remove_chunk() and at + * btrfs_remove_block_group()), which are used for: + * + * 1 unit for adding the free space inode's orphan (located in the tree + * of tree roots). + * 1 unit for deleting the block group item (located in the extent + * tree). + * 1 unit for deleting the free space item (located in tree of tree + * roots). + * N units for deleting N device extent items corresponding to each + * stripe (located in the device tree). + * + * In order to remove a block group we also need to reserve units in the + * system space info in order to update the chunk tree (update one or + * more device items and remove one chunk item), but this is done at + * btrfs_remove_chunk() through a call to check_system_chunk(). + */ + map = em->map_lookup; + num_items = 3 + map->num_stripes; + free_extent_map(em); + + return btrfs_start_transaction_fallback_global_rsv(root, num_items); +} + +/* + * Mark block group @cache read-only, so later writes won't happen to block + * group @cache. + * + * If @force is not set, this function will only mark the block group readonly + * if we have enough free space (1M) in other metadata/system block groups. + * If @force is set, this function will mark the block group readonly + * without checking free space. + * + * NOTE: This function doesn't care if other block groups can contain all the + * data in this block group. That check should be done by the relocation routine, + * not this function. + */ +static int inc_block_group_ro(struct btrfs_block_group *cache, int force) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + + if (cache->swap_extents) { + ret = -ETXTBSY; + goto out; + } + + if (cache->ro) { + cache->ro++; + ret = 0; + goto out; + } + + num_bytes = cache->length - cache->reserved - cache->pinned - + cache->bytes_super - cache->zone_unusable - cache->used; + + /* + * Data never overcommits, even in mixed mode, so do just the straight + * check of left over space in how much we have allocated. + */ + if (force) { + ret = 0; + } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { + u64 sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer.
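+ * For example (numbers purely illustrative): with sinfo->total_bytes of + * 10GiB, 7GiB used and a 1GiB block group that still has 256MiB + * unallocated (num_bytes), 7GiB + 256MiB <= 10GiB holds, so the block + * group can be flipped read-only while the data space info still has room + * for that 256MiB elsewhere.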
+ */ + if (sinfo_used + num_bytes <= sinfo->total_bytes) + ret = 0; + } else { + /* + * We overcommit metadata, so we need to do the + * btrfs_can_overcommit check here, and we need to pass in + * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of + * leeway to allow us to mark this block group as read only. + */ + if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, + BTRFS_RESERVE_NO_FLUSH)) + ret = 0; + } + + if (!ret) { + sinfo->bytes_readonly += num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes to readonly */ + sinfo->bytes_readonly += cache->zone_unusable; + sinfo->bytes_zone_unusable -= cache->zone_unusable; + cache->zone_unusable = 0; + } + cache->ro++; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); + } +out: + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { + btrfs_info(cache->fs_info, + "unable to make block group %llu ro", cache->start); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + } + return ret; +} + +static bool clean_pinned_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_transaction *prev_trans = NULL; + const u64 start = bg->start; + const u64 end = start + bg->length - 1; + int ret; + + spin_lock(&fs_info->trans_lock); + if (trans->transaction->list.prev != &fs_info->trans_list) { + prev_trans = list_last_entry(&trans->transaction->list, + struct btrfs_transaction, list); + refcount_inc(&prev_trans->use_count); + } + spin_unlock(&fs_info->trans_lock); + + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, another + * task might be running finish_extent_commit() for the previous + * transaction N - 1, and have seen a range belonging to the block + * group in pinned_extents before we were able to clear the whole block + * group range from pinned_extents. This means that task can lookup for + * the block group after we unpinned it from pinned_extents and removed + * it, leading to a BUG_ON() at unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + if (prev_trans) { + ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); + if (ret) + goto out; + } + + ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY); +out: + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + if (prev_trans) + btrfs_put_transaction(prev_trans); + + return ret == 0; +} + +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); + int ret = 0; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (btrfs_fs_closing(fs_info)) + return; + + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip deletion if we're unable to get the mutex. 
+ */ + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + int trimming; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + + space_info = block_group->space_info; + + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + /* + * Async discard moves the final block group discard to be prior + * to the unused_bgs code path. Therefore, if it's not fully + * trimmed, punt it back to the async discard lists. + */ + if (btrfs_test_opt(fs_info, DISCARD_ASYNC) && + !btrfs_is_free_space_trimmed(block_group)) { + trace_btrfs_skip_unused_block_group(block_group); + up_write(&space_info->groups_sem); + /* Requeue if we failed because of async discard */ + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto next; + } + + spin_lock(&block_group->lock); + if (block_group->reserved || block_group->pinned || + block_group->used || block_group->ro || + list_is_singular(&block_group->list)) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = inc_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + ret = btrfs_zone_finish(block_group); + if (ret < 0) { + btrfs_dec_block_group_ro(block_group); + if (ret == -EAGAIN) + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_start_trans_remove_block_group(fs_info, + block_group->start); + if (IS_ERR(trans)) { + btrfs_dec_block_group_ro(block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + if (!clean_pinned_extents(trans, block_group)) { + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + + /* + * At this point, the block_group is read only and should fail + * new allocations. However, btrfs_finish_extent_commit() can + * cause this block_group to be placed back on the discard + * lists because now the block_group isn't fully discarded. + * Bail here and try again later after discarding everything. 
+ */ + spin_lock(&fs_info->discard_ctl.lock); + if (!list_empty(&block_group->discard_list)) { + spin_unlock(&fs_info->discard_ctl.lock); + btrfs_dec_block_group_ro(block_group); + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto end_trans; + } + spin_unlock(&fs_info->discard_ctl.lock); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); + space_info->bytes_readonly += block_group->pinned; + block_group->pinned = 0; + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + /* + * The normal path here is an unused block group is passed here, + * then trimming is handled in the transaction commit path. + * Async discard interposes before this to do the trimming + * before coming down the unused block group path as trimming + * will no longer be done later in the transaction commit path. + */ + if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) + goto flip_async; + + /* + * DISCARD can flip during remount. On zoned filesystems, we + * need to reset sequential-required zones. + */ + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_is_zoned(fs_info); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_freeze_block_group(block_group); + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, block_group->start); + + if (ret) { + if (trimming) + btrfs_unfreeze_block_group(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. + */ + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_get_block_group(block_group); + } +end_trans: + btrfs_end_transaction(trans); +next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + return; + +flip_async: + btrfs_end_transaction(trans); + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_put_block_group(block_group); + btrfs_discard_punt_unused_bgs_list(fs_info); +} + +void btrfs_mark_bg_unused(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { + /* Pull out the block group from the reclaim_bgs list. */ + trace_btrfs_add_unused_block_group(bg); + list_move_tail(&bg->bg_list, &fs_info->unused_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +/* + * We want block groups with a low number of used bytes to be in the beginning + * of the list, so they will get reclaimed first. 
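+ * (list_sort() treats a positive return value from the comparator as "a + * sorts after b", so returning bg1->used > bg2->used below yields ascending + * order by used bytes, e.g. block groups with 96MiB, 512MiB and 900MiB used + * end up in that order on the list.)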
+ */ +static int reclaim_bgs_cmp(void *unused, const struct list_head *a, + const struct list_head *b) +{ + const struct btrfs_block_group *bg1, *bg2; + + bg1 = list_entry(a, struct btrfs_block_group, bg_list); + bg2 = list_entry(b, struct btrfs_block_group, bg_list); + + return bg1->used > bg2->used; +} + +static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +{ + if (btrfs_is_zoned(fs_info)) + return btrfs_zoned_should_reclaim(fs_info); + return true; +} + +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = mult_perc(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (btrfs_fs_closing(fs_info)) + return; + + if (!btrfs_should_reclaim(fs_info)) + return; + + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); + return; + } + + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip reclaim if we're unable to get the mutex. + */ + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { + btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); + return; + } + + spin_lock(&fs_info->unused_bgs_lock); + /* + * Sort happens under lock because we can't simply splice it and sort. + * The block groups might still be in use and reachable via bg_list, + * and their presence in the reclaim_bgs list must be preserved. + */ + list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); + while (!list_empty(&fs_info->reclaim_bgs)) { + u64 zone_unusable; + int ret = 0; + + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. 
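+ * + * As a concrete (made up) example of the threshold check in + * should_reclaim_block_group() above: with bg_reclaim_threshold at 75 and a + * 1GiB block group, the threshold is 768MiB, so reclaim is only triggered + * by a free that moves the used bytes from at or above 768MiB to below it.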
+ */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + + /* + * Get out fast, in case we're read-only or unmounting the + * filesystem. It is OK to drop block groups from the list even + * for the read-only case. As we did sb_start_write(), + * "mount -o remount,ro" won't happen and read-only filesystem + * means it is forced read-only due to a fatal error. So, it + * never gets back to read-write to let us reclaim again. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { + up_write(&space_info->groups_sem); + goto next; + } + + /* + * Cache the zone_unusable value before turning the block group + * read-only. As soon as the block group is read-only its + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. + */ + zone_unusable = bg->zone_unusable; + ret = inc_block_group_ro(bg, 0); + up_write(&space_info->groups_sem); + if (ret < 0) + goto next; + + btrfs_info(fs_info, + "reclaiming chunk %llu with %llu%% used %llu%% unusable", + bg->start, + div64_u64(bg->used * 100, bg->length), + div64_u64(zone_unusable * 100, bg->length)); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); + if (ret) { + btrfs_dec_block_group_ro(bg); + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + } + +next: + if (ret) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + + mutex_unlock(&fs_info->reclaim_bgs_lock); + /* + * Reclaiming all the block groups in the list can take really + * long. Prioritize cleaning up unused block groups. + */ + btrfs_delete_unused_bgs(fs_info); + /* + * If we are interrupted by a balance, we can just bail out. The + * cleaner thread will restart it again if necessary.
+ */ + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) + goto end; + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); +end: + btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); +} + +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&fs_info->reclaim_bgs)) + queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_reclaim_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, + struct btrfs_path *path) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_block_group_item bg; + struct extent_buffer *leaf; + int slot; + u64 flags; + int ret = 0; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + em_tree = &fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, key->objectid, key->offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(fs_info, + "logical %llu len %llu found bg but no related chunk", + key->objectid, key->offset); + return -ENOENT; + } + + if (em->start != key->objectid || em->len != key->offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + key->objectid, key->offset, em->start, em->len); + ret = -EUCLEAN; + goto out_free_em; + } + + read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_stack_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + key->objectid, key->offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); + ret = -EUCLEAN; + } + +out_free_em: + free_extent_map(em); + return ret; +} + +static int find_first_block_group(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_root *root = btrfs_block_group_root(fs_info); + int ret; + struct btrfs_key found_key; + + btrfs_for_each_slot(root, key, &found_key, path, ret) { + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + return read_bg_from_eb(fs_info, &found_key, path); + } + } + return ret; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +/* + * Map a physical disk address to a list of logical addresses. 
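+ * + * For example, if @physical falls inside stripe i of the chunk, the logical + * address is reconstructed roughly as the chunk start plus + * stripe_nr * io_stripe_size plus the offset within the stripe (see the + * per-stripe math below); duplicates are filtered so each logical address + * is reported only once.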
+ * + * @fs_info: the filesystem + * @chunk_start: logical address of block group + * @physical: physical address to map to logical addresses + * @logical: return array of logical addresses which map to @physical + * @naddrs: length of @logical + * @stripe_len: size of IO stripe for the given block group + * + * Maps a particular @physical disk address to a list of @logical addresses. + * Used primarily to exclude those portions of a block group that contain super + * block copies. + */ +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 data_stripe_length; + u64 io_stripe_size; + int i, nr = 0; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) + return -EIO; + + map = em->map_lookup; + data_stripe_length = em->orig_block_len; + io_stripe_size = BTRFS_STRIPE_LEN; + chunk_start = em->start; + + /* For RAID5/6 adjust to a full IO stripe length */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); + + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + bool already_inserted = false; + u32 stripe_nr; + u32 offset; + int j; + + if (!in_range(physical, map->stripes[i].physical, + data_stripe_length)) + continue; + + stripe_nr = (physical - map->stripes[i].physical) >> + BTRFS_STRIPE_LEN_SHIFT; + offset = (physical - map->stripes[i].physical) & + BTRFS_STRIPE_LEN_MASK; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) + stripe_nr = div_u64(stripe_nr * map->num_stripes + i, + map->sub_stripes); + /* + * The remaining case would be for RAID56, multiply by + * nr_data_stripes(). 
Alternatively, just use rmap_len below + * instead of map->stripe_len + */ + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; + + /* Ensure we don't add duplicate addresses */ + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) { + already_inserted = true; + break; + } + } + + if (!already_inserted) + buf[nr++] = bytenr; + } + + *logical = buf; + *naddrs = nr; + *stripe_len = io_stripe_size; +out: + free_extent_map(em); + return ret; +} + +static int exclude_super_stripes(struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + const bool zoned = btrfs_is_zoned(fs_info); + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->start < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; + cache->bytes_super += stripe_len; + ret = set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_UPTODATE, NULL); + if (ret) + return ret; + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(fs_info, cache->start, + bytenr, &logical, &nr, &stripe_len); + if (ret) + return ret; + + /* Shouldn't have super stripes in sequential zones */ + if (zoned && nr) { + kfree(logical); + btrfs_err(fs_info, + "zoned: block group %llu must not contain super block", + cache->start); + return -EUCLEAN; + } + + while (nr--) { + u64 len = min_t(u64, stripe_len, + cache->start + cache->length - logical[nr]); + + cache->bytes_super += len; + ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], + logical[nr] + len - 1, + EXTENT_UPTODATE, NULL); + if (ret) { + kfree(logical); + return ret; + } + } + + kfree(logical); + } + return 0; +} + +static struct btrfs_block_group *btrfs_create_block_group_cache( + struct btrfs_fs_info *fs_info, u64 start) +{ + struct btrfs_block_group *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->start = start; + + cache->fs_info = fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); + + cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; + + refcount_set(&cache->refs, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->discard_list); + INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); + INIT_LIST_HEAD(&cache->active_bg_list); + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); + atomic_set(&cache->frozen, 0); + mutex_init(&cache->free_space_lock); + + return cache; +} + +/* + * Iterate all chunks and verify that each of them has the corresponding block + * group + */ +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct btrfs_block_group *bg; + u64 start = 0; + int ret = 0; + + while (1) { + read_lock(&map_tree->lock); + /* + * lookup_extent_mapping will return the first extent map + * intersecting the range, so setting @len to 1 is enough to + * get the first chunk. 
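+ * For example, with chunks mapped at [0, 8MiB) and [8MiB, 16MiB), the first + * pass (start == 0) returns the first chunk, start is then advanced to + * em->start + em->len == 8MiB and the next pass returns the second chunk; + * the loop ends once no further mapping is found.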
+ */ + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); + if (!em) + break; + + bg = btrfs_lookup_block_group(fs_info, em->start); + if (!bg) { + btrfs_err(fs_info, + "chunk start=%llu len=%llu doesn't have corresponding block group", + em->start, em->len); + ret = -EUCLEAN; + free_extent_map(em); + break; + } + if (bg->start != em->start || bg->length != em->len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", + em->start, em->len, + em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + bg->start, bg->length, + bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ret = -EUCLEAN; + free_extent_map(em); + btrfs_put_block_group(bg); + break; + } + start = em->start + em->len; + free_extent_map(em); + btrfs_put_block_group(bg); + } + return ret; +} + +static int read_one_block_group(struct btrfs_fs_info *info, + struct btrfs_block_group_item *bgi, + const struct btrfs_key *key, + int need_clear) +{ + struct btrfs_block_group *cache; + const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); + int ret; + + ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); + + cache = btrfs_create_block_group_cache(info, key->objectid); + if (!cache) + return -ENOMEM; + + cache->length = key->offset; + cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; + cache->flags = btrfs_stack_block_group_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + + set_free_space_tree_thresholds(cache); + + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + if (btrfs_test_opt(info, SPACE_CACHE)) + cache->disk_cache_state = BTRFS_DC_CLEAR; + } + if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->start); + ret = -EINVAL; + goto error; + } + + ret = btrfs_load_block_group_zone_info(cache, false); + if (ret) { + btrfs_err(info, "zoned: failed to load zone info of bg %llu", + cache->start); + goto error; + } + + /* + * We need to exclude the super stripes now so that the space info has + * super bytes accounted for, otherwise we'll think we have more space + * than we actually do. + */ + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case. */ + btrfs_free_excluded_extents(cache); + goto error; + } + + /* + * For zoned filesystem, space after the allocation offset is the only + * free space for a block group. So, we don't need any caching work. + * btrfs_calc_zone_unusable() will set the amount of free space and + * zone_unusable space. + * + * For regular filesystem, check for two cases, either we are full, and + * therefore don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all the space + * in and be done with it. This saves us _a_lot_ of time, particularly + * in the full case. + */ + if (btrfs_is_zoned(info)) { + btrfs_calc_zone_unusable(cache); + /* Should not have any excluded extents. 
Just in case, though. */ + btrfs_free_excluded_extents(cache); + } else if (cache->length == cache->used) { + cache->cached = BTRFS_CACHE_FINISHED; + btrfs_free_excluded_extents(cache); + } else if (cache->used == 0) { + cache->cached = BTRFS_CACHE_FINISHED; + ret = btrfs_add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); + btrfs_free_excluded_extents(cache); + if (ret) + goto error; + } + + ret = btrfs_add_block_group_cache(info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + goto error; + } + trace_btrfs_add_block_group(info, cache, 0); + btrfs_add_bg_to_space_info(info, cache); + + set_avail_alloc_bits(info, cache->flags); + if (btrfs_chunk_writeable(info, cache->start)) { + if (cache->used == 0) { + ASSERT(list_empty(&cache->bg_list)); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&info->discard_ctl, cache); + else + btrfs_mark_bg_unused(cache); + } + } else { + inc_block_group_ro(cache, 1); + } + + return 0; +error: + btrfs_put_block_group(cache); + return ret; +} + +static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct rb_node *node; + int ret = 0; + + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { + struct extent_map *em; + struct map_lookup *map; + struct btrfs_block_group *bg; + + em = rb_entry(node, struct extent_map, rb_node); + map = em->map_lookup; + bg = btrfs_create_block_group_cache(fs_info, em->start); + if (!bg) { + ret = -ENOMEM; + break; + } + + /* Fill dummy cache as FULL */ + bg->length = em->len; + bg->flags = map->type; + bg->cached = BTRFS_CACHE_FINISHED; + bg->used = em->len; + bg->flags = map->type; + ret = btrfs_add_block_group_cache(fs_info, bg); + /* + * We may have some valid block group cache added already, in + * that case we skip to the next one. + */ + if (ret == -EEXIST) { + ret = 0; + btrfs_put_block_group(bg); + continue; + } + + if (ret) { + btrfs_remove_free_space_cache(bg); + btrfs_put_block_group(bg); + break; + } + + btrfs_add_bg_to_space_info(fs_info, bg); + + set_avail_alloc_bits(fs_info, bg->flags); + } + if (!ret) + btrfs_init_global_block_rsv(fs_info); + return ret; +} + +int btrfs_read_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_root *root = btrfs_block_group_root(info); + struct btrfs_path *path; + int ret; + struct btrfs_block_group *cache; + struct btrfs_space_info *space_info; + struct btrfs_key key; + int need_clear = 0; + u64 cache_gen; + + /* + * Either no extent root (with ibadroots rescue option) or we have + * unsupported RO options. The fs can never be mounted read-write, so no + * need to waste time searching block group items. + * + * This also allows new extent tree related changes to be RO compat, + * no need for a full incompat flag. 
+ */ + if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP)) + return fill_dummy_bgs(info); + + key.objectid = 0; + key.offset = 0; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + cache_gen = btrfs_super_cache_generation(info->super_copy); + if (btrfs_test_opt(info, SPACE_CACHE) && + btrfs_super_generation(info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(info, CLEAR_CACHE)) + need_clear = 1; + + while (1) { + struct btrfs_block_group_item bgi; + struct extent_buffer *leaf; + int slot; + + ret = find_first_block_group(info, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_release_path(path); + ret = read_one_block_group(info, &bgi, &key, need_clear); + if (ret < 0) + goto error; + key.objectid += key.offset; + key.offset = 0; + } + btrfs_release_path(path); + + list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + + if (!(btrfs_get_alloc_profile(info, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * Avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + inc_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + inc_block_group_ro(cache, 1); + } + + btrfs_init_global_block_rsv(info); + ret = check_chunk_block_group_mappings(info); +error: + btrfs_free_path(path); + /* + * We've hit some error while reading the extent tree, and have + * rescue=ibadroots mount option. + * Try to fill the tree using dummy block groups so that the user can + * continue to mount and grab their data. + */ + if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) + ret = fill_dummy_bgs(info); + return ret; +} + +/* + * This function, insert_block_group_item(), belongs to the phase 2 of chunk + * allocation. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. 
+ */ +static int insert_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_item bgi; + struct btrfs_root *root = btrfs_block_group_root(fs_info); + struct btrfs_key key; + u64 old_commit_used; + int ret; + + spin_lock(&block_group->lock); + btrfs_set_stack_block_group_used(&bgi, block_group->used); + btrfs_set_stack_block_group_chunk_objectid(&bgi, + block_group->global_root_id); + btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + old_commit_used = block_group->commit_used; + block_group->commit_used = block_group->used; + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + if (ret < 0) { + spin_lock(&block_group->lock); + block_group->commit_used = old_commit_used; + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int insert_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); + if (ret) + goto out; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(trans, leaf); +out: + btrfs_free_path(path); + return ret; +} + +/* + * This function belongs to phase 2. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. + */ +static int insert_dev_extents(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + u64 dev_offset; + u64 stripe_size; + int i; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + stripe_size = em->orig_block_len; + + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * resulting in persisting a device extent item with such ID. 
+ */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, + stripe_size); + if (ret) + break; + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + free_extent_map(em); + return ret; +} + +/* + * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of + * chunk allocation. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. + */ +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *block_group; + int ret = 0; + + while (!list_empty(&trans->new_bgs)) { + int index; + + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group, + bg_list); + if (ret) + goto next; + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + + ret = insert_block_group_item(trans, block_group); + if (ret) + btrfs_abort_transaction(trans, ret); + if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + &block_group->runtime_flags)) { + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) + btrfs_abort_transaction(trans, ret); + } + ret = insert_dev_extents(trans, block_group->start, + block_group->length); + if (ret) + btrfs_abort_transaction(trans, ret); + add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + + /* Already aborted the transaction if it failed. */ +next: + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + } + btrfs_trans_release_chunk_metadata(trans); +} + +/* + * For extent tree v2 we use the block_group_item->chunk_offset to point at our + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. + */ +static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +{ + u64 div = SZ_1G; + u64 index; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; + + /* If we have a smaller fs index based on 128MiB. */ + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) + div = SZ_128M; + + offset = div64_u64(offset, div); + div64_u64_rem(offset, fs_info->nr_global_roots, &index); + return index; +} + +struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, + u64 type, + u64 chunk_offset, u64 size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; + int ret; + + btrfs_set_log_full_commit(trans); + + cache = btrfs_create_block_group_cache(fs_info, chunk_offset); + if (!cache) + return ERR_PTR(-ENOMEM); + + /* + * Mark it as new before adding it to the rbtree of block groups or any + * list, so that no other task finds it and calls btrfs_mark_bg_unused() + * before the new flag is set. 
+ */ + set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); + + cache->length = size; + set_free_space_tree_thresholds(cache); + cache->flags = type; + cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); + + ret = btrfs_load_block_group_zone_info(cache, true); + if (ret) { + btrfs_put_block_group(cache); + return ERR_PTR(ret); + } + + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); + return ERR_PTR(ret); + } + + ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); + btrfs_free_excluded_extents(cache); + if (ret) { + btrfs_put_block_group(cache); + return ERR_PTR(ret); + } + + /* + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. + */ + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + ASSERT(cache->space_info); + + ret = btrfs_add_block_group_cache(fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ERR_PTR(ret); + } + + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ + trace_btrfs_add_block_group(fs_info, cache, 1); + btrfs_add_bg_to_space_info(fs_info, cache); + btrfs_update_global_block_rsv(fs_info); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + cache->space_info->bytes_used += size >> 1; + fragment_free_space(cache); + } +#endif + + list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); + + set_avail_alloc_bits(fs_info, type); + return cache; +} + +/* + * Mark one block group RO, can be called several times for the same block + * group. + * + * @cache: the destination block group + * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to + * ensure we still have some free space after marking this + * block group RO. + */ +int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_block_group_root(fs_info); + u64 alloc_flags; + int ret; + bool dirty_bg_running; + + /* + * This can only happen when we are doing read-only scrub on read-only + * mount. + * In that case we should not start a new transaction on read-only fs. + * Thus here we skip all chunk allocations. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_lock(&fs_info->ro_block_group_mutex); + ret = inc_block_group_ro(cache, 0); + mutex_unlock(&fs_info->ro_block_group_mutex); + return ret; + } + + do { + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + dirty_bg_running = false; + + /* + * We're not allowed to set block groups readonly after the dirty + * block group cache has started writing. If it already started, + * back off and let this transaction commit. 
+ */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; + + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + dirty_bg_running = true; + } + } while (dirty_bg_running); + + if (do_chunk_alloc) { + /* + * If we are changing raid levels, try to allocate a + * corresponding block group with the new raid level. + */ + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); + if (alloc_flags != cache->flags) { + ret = btrfs_chunk_alloc(trans, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + } + + ret = inc_block_group_ro(cache, 0); + if (!ret) + goto out; + if (ret == -ETXTBSY) + goto unlock_out; + + /* + * Skip chunk alloction if the bg is SYSTEM, this is to avoid system + * chunk allocation storm to exhaust the system chunk array. Otherwise + * we still want to try our best to mark the block group read-only. + */ + if (!do_chunk_alloc && ret == -ENOSPC && + (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) + goto unlock_out; + + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + /* + * We have allocated a new chunk. We also need to activate that chunk to + * grant metadata tickets for zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + if (ret < 0) + goto out; + + ret = inc_block_group_ro(cache, 0); + if (ret == -ETXTBSY) + goto unlock_out; +out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); + mutex_lock(&fs_info->chunk_mutex); + check_system_chunk(trans, alloc_flags); + mutex_unlock(&fs_info->chunk_mutex); + } +unlock_out: + mutex_unlock(&fs_info->ro_block_group_mutex); + + btrfs_end_transaction(trans); + return ret; +} + +void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + if (!--cache->ro) { + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes back */ + cache->zone_unusable = + (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + sinfo->bytes_zone_unusable += cache->zone_unusable; + sinfo->bytes_readonly -= cache->zone_unusable; + } + num_bytes = cache->length - cache->reserved - + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); +} + +static int update_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_root *root = btrfs_block_group_root(fs_info); + unsigned long bi; + struct extent_buffer *leaf; + struct btrfs_block_group_item bgi; + struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. 
+ * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. */ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); + + key.objectid = cache->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = cache->length; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto fail; + } + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + btrfs_set_stack_block_group_used(&bgi, used); + btrfs_set_stack_block_group_chunk_objectid(&bgi, + cache->global_root_id); + btrfs_set_stack_block_group_flags(&bgi, cache->flags); + write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); + btrfs_mark_buffer_dirty(trans, leaf); +fail: + btrfs_release_path(path); + /* + * We didn't update the block group item, need to revert commit_used + * unless the block group item didn't exist yet - this is to prevent a + * race with a concurrent insertion of the block group item, with + * insert_block_group_item(), that happened just after we attempted to + * update. In that case we would reset commit_used to 0 just after the + * insertion set it to a value greater than 0 - if the block group later + * becomes with 0 used bytes, we would incorrectly skip its update. + */ + if (ret < 0 && ret != -ENOENT) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } + return ret; + +} + +static int cache_save_setup(struct btrfs_block_group *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + u64 cache_size = 0; + int retries = 0; + int ret = 0; + + if (!btrfs_test_opt(fs_info, SPACE_CACHE)) + return 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->length < (100 * SZ_1M)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + + if (TRANS_ABORTED(trans)) + return 0; +again: + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. 
So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, ret); + goto out_put; + } + WARN_ON(ret); + + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + + if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(fs_info, + &fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(trans, NULL, inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(fs_info, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). + */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + cache_size = div_u64(block_group->length, SZ_256M); + if (!cache_size) + cache_size = 1; + + cache_size *= 16; + cache_size *= fs_info->sectorsize; + + ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, + cache_size, false); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, + cache_size, cache_size, + &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. 
+ */ + if (!ret) + dcs = BTRFS_DC_SETUP; + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + extent_changeset_free(data_reserved); + return ret; +} + +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(fs_info, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + +/* + * Transaction commit does final block group cache writeback during a critical + * section where nothing is allowed to change the FS. This is required in + * order for the cache to actually match the block group, but can introduce a + * lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. + * There's a chance we'll have to redo some of it if the block group changes + * again during the commit, but it greatly reduces the commit latency by + * getting rid of the easy block groups while we're still allowing others to + * join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* Make sure all the block groups on our dirty list actually exist */ + btrfs_create_pending_block_groups(trans); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + bool drop_reserve = true; + + cache = list_first_entry(&dirty, struct btrfs_block_group, + dirty_list); + /* + * This can happen if something re-dirties a block group that + * is already under IO. Just wait for it to finish and then do + * it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide if + * it should update the cache_state. Don't delete until after + * we wait. 
+ * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + should_put = 0; + + /* + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = update_block_group_item(trans, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + drop_reserve = false; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, ret); + } + } + + /* If it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + goto out; + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * Go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + if (!ret) + ret = btrfs_run_delayed_refs(trans, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } +out: + if (ret < 0) { + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&dirty, &cur_trans->dirty_bgs); + spin_unlock(&cur_trans->dirty_bgs_lock); + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. 
These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + spin_lock(&cur_trans->dirty_bgs_lock); + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group, + dirty_list); + + /* + * This can happen if cache_save_setup re-dirties a block group + * that is already under IO. Just wait for it to finish and + * then do it all again + */ + if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + + /* + * Don't remove from the dirty list until after we've waited on + * any pending IO + */ + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (!ret) + ret = btrfs_run_delayed_refs(trans, + (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = update_block_group_item(trans, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * very rare case so no need for a more efficient and + * complex approach. 
+ */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = update_block_group_item(trans, path, cache); + } + if (ret) + btrfs_abort_transaction(trans, ret); + } + + /* If its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); + spin_lock(&cur_trans->dirty_bgs_lock); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool alloc) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_block_group *cache = NULL; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + int ret = 0; + + /* Block accounting for super block */ + spin_lock(&info->delalloc_root_lock); + old_val = btrfs_super_bytes_used(info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(info->super_copy, old_val); + spin_unlock(&info->delalloc_root_lock); + + while (total) { + struct btrfs_space_info *space_info; + bool reclaim = false; + + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) { + ret = -ENOENT; + break; + } + space_info = cache->space_info; + factor = btrfs_bg_type_to_factor(cache->flags); + + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. 
+ */ + if (!alloc && !btrfs_block_group_done(cache)) + btrfs_cache_block_group(cache, true); + + byte_in_group = bytenr - cache->start; + WARN_ON(byte_in_group > cache->length); + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + old_val = cache->used; + num_bytes = min(total, cache->length - byte_in_group); + if (alloc) { + old_val += num_bytes; + cache->used = old_val; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->bytes_used += num_bytes; + space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + old_val -= num_bytes; + cache->used = old_val; + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, space_info, + num_bytes); + space_info->bytes_used -= num_bytes; + space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); + + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + + set_extent_bit(&trans->transaction->pinned_extents, + bytenr, bytenr + num_bytes - 1, + EXTENT_DIRTY, NULL); + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->delayed_ref_updates++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); + } + + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + btrfs_update_delayed_refs_rsv(trans); + return ret; +} + +/* + * Update the block_group and space info counters. + * + * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space. If this is a + * reservation and the block group has become read only we cannot make the + * reservation and return -EAGAIN, otherwise this function always succeeds. 
+ */ +int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) +{ + struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + goto out; + } + + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; + } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} + +/* + * Update the block_group and space info counters. + * + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); + spin_unlock(&space_info->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + list_for_each_entry(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } +} + +static int should_alloc_chunk(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, int force) +{ + u64 bytes_used = btrfs_space_info_used(sinfo, false); + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. 
+ */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(fs_info->super_copy); + thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); + + if (sinfo->total_bytes - bytes_used < thresh) + return 1; + } + + if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) + return 0; + return 1; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) +{ + u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); +} + +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +{ + struct btrfs_block_group *bg; + int ret; + + /* + * Check if we have enough space in the system space info because we + * will need to update device items in the chunk btree and insert a new + * chunk item in the chunk btree as well. This will allocate a new + * system block group if needed. + */ + check_system_chunk(trans, flags); + + bg = btrfs_create_chunk(trans, flags); + if (IS_ERR(bg)) { + ret = PTR_ERR(bg); + goto out; + } + + ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); + /* + * Normally we are not expected to fail with -ENOSPC here, since we have + * previously reserved space in the system space_info and allocated one + * new system chunk if necessary. However there are three exceptions: + * + * 1) We may have enough free space in the system space_info but all the + * existing system block groups have a profile which can not be used + * for extent allocation. + * + * This happens when mounting in degraded mode. For example we have a + * RAID1 filesystem with 2 devices, lose one device and mount the fs + * using the other device in degraded mode. If we then allocate a chunk, + * we may have enough free space in the existing system space_info, but + * none of the block groups can be used for extent allocation since they + * have a RAID1 profile, and because we are in degraded mode with a + * single device, we are forced to allocate a new system chunk with a + * SINGLE profile. Making check_system_chunk() iterate over all system + * block groups and check if they have a usable profile and enough space + * can be slow on very large filesystems, so we tolerate the -ENOSPC and + * try again after forcing allocation of a new system chunk. Like this + * we avoid paying the cost of that search in normal circumstances, when + * we were not mounted in degraded mode; + * + * 2) We had enough free space in the system space_info, and one suitable + * block group to allocate from when we called check_system_chunk() + * above. However right after we called it, the only system block group + * with enough free space got turned into RO mode by a running scrub, + * and in this case we have to allocate a new one and retry. We only + * need to do this allocation and retry once, since we have a transaction + * handle and scrub uses the commit root to search for block groups; + * + * 3) We had one system block group with enough free space when we called + * check_system_chunk(), but after that, right before we tried to + * allocate the last extent buffer we needed, a discard operation came + * in and it temporarily removed the last free space entry from the + * block group (discard removes a free space entry, discards it, and + * then adds back the entry to the block group cache).
+ */ + if (ret == -ENOSPC) { + const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); + struct btrfs_block_group *sys_bg; + + sys_bg = btrfs_create_chunk(trans, sys_flags); + if (IS_ERR(sys_bg)) { + ret = PTR_ERR(sys_bg); + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } else if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } +out: + btrfs_trans_release_chunk_metadata(trans); + + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; +} + +/* + * Chunk allocation is done in 2 phases: + * + * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for + * the chunk, the chunk mapping, create its block group and add the items + * that belong in the chunk btree to it - more specifically, we need to + * update device items in the chunk btree and add a new chunk item to it. + * + * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block + * group item to the extent btree and the device extent items to the devices + * btree. + * + * This is done to prevent deadlocks. For example when COWing a node from the + * extent btree we are holding a write lock on the node's parent and if we + * trigger chunk allocation and attempted to insert the new block group item + * in the extent btree right way, we could deadlock because the path for the + * insertion can include that parent node. At first glance it seems impossible + * to trigger chunk allocation after starting a transaction since tasks should + * reserve enough transaction units (metadata space), however while that is true + * most of the time, chunk allocation may still be triggered for several reasons: + * + * 1) When reserving metadata, we check if there is enough free space in the + * metadata space_info and therefore don't trigger allocation of a new chunk. + * However later when the task actually tries to COW an extent buffer from + * the extent btree or from the device btree for example, it is forced to + * allocate a new block group (chunk) because the only one that had enough + * free space was just turned to RO mode by a running scrub for example (or + * device replace, block group reclaim thread, etc), so we can not use it + * for allocating an extent and end up being forced to allocate a new one; + * + * 2) Because we only check that the metadata space_info has enough free bytes, + * we end up not allocating a new metadata chunk in that case. However if + * the filesystem was mounted in degraded mode, none of the existing block + * groups might be suitable for extent allocation due to their incompatible + * profile (for e.g. mounting a 2 devices filesystem, where all block groups + * use a RAID1 profile, in degraded mode using a single device). 
In this case + * when the task attempts to COW some extent buffer of the extent btree for + * example, it will trigger allocation of a new metadata block group with a + * suitable profile (SINGLE profile in the example of the degraded mount of + * the RAID1 filesystem); + * + * 3) The task has reserved enough transaction units / metadata space, but when + * it attempts to COW an extent buffer from the extent or device btree for + * example, it does not find any free extent in any metadata block group, + * and is therefore forced to try to allocate a new metadata block group. + * This is because some other task allocated all available extents in the + * meanwhile - this typically happens with tasks that don't reserve space + * properly, either intentionally or as a bug. One example where this is + * done intentionally is fsync, as it does not reserve any transaction units + * and ends up allocating a variable number of metadata extents for log + * tree extent buffers; + * + * 4) The task has reserved enough transaction units / metadata space, but right + * before it tries to allocate the last extent buffer it needs, a discard + * operation comes in and, temporarily, removes the last free space entry from + * the only metadata block group that had free space (discard starts by + * removing a free space entry from a block group, then does the discard + * operation and, once it's done, it adds back the free space entry to the + * block group). + * + * We also need this 2-phase setup when adding a device to a filesystem with + * a seed device - we must create new metadata and system chunks without adding + * any of the block group items to the chunk, extent and device btrees. If we + * did not do it this way, we would get ENOSPC when attempting to update those + * btrees, since all the chunks from the seed device are read-only. + * + * Phase 1 does the updates and insertions to the chunk btree because if we had + * it done in phase 2 and have a thundering herd of tasks allocating chunks in + * parallel, we risk having too many system chunks allocated by many tasks if + * many tasks reach phase 1 without the previous ones completing phase 2. In the + * extreme case this leads to exhaustion of the system chunk array in the + * superblock. This is easier to trigger if using a btree node/leaf size of 64K + * and with RAID filesystems (so we have more device items in the chunk btree). + * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of + * the system chunk array due to concurrent allocations") provides more details. + * + * Allocation of system chunks does not happen through this function. A task that + * needs to update the chunk btree (the only btree that uses system chunks), must + * preallocate chunk space by calling either check_system_chunk() or + * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or + * metadata chunk or when removing a chunk, while the latter is used before doing + * a modification to the chunk btree - use cases for the latter are adding, + * removing and resizing a device as well as relocation of a system chunk. + * See the comment below for more details. + * + * The reservation of system space, done through check_system_chunk(), as well + * as all the updates and insertions into the chunk btree must be done while + * holding fs_info->chunk_mutex.
This is important to guarantee that while COWing + * an extent buffer from the chunks btree we never trigger allocation of a new + * system chunk, which would result in a deadlock (trying to lock twice an + * extent buffer of the chunk btree, first time before triggering the chunk + * allocation and the second time during chunk allocation while attempting to + * update the chunks btree). The system chunk array is also updated while holding + * that mutex. The same logic applies to removing chunks - we must reserve system + * space, update the chunk btree and the system chunk array in the superblock + * while holding fs_info->chunk_mutex. + * + * This function, btrfs_chunk_alloc(), belongs to phase 1. + * + * If @force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If @force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; + bool wait_for_alloc = false; + bool should_alloc = false; + bool from_extent_allocation = false; + int ret = 0; + + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + /* + * Allocation of system chunks can not happen through this path, as we + * could end up in a deadlock if we are allocating a data or metadata + * chunk and there is another task modifying the chunk btree. + * + * This is because while we are holding the chunk mutex, we will attempt + * to add the new chunk item to the chunk btree or update an existing + * device item in the chunk btree, while the other task that is modifying + * the chunk btree is attempting to COW an extent buffer while holding a + * lock on it and on its parent - if the COW operation triggers a system + * chunk allocation, then we can deadlock because we are holding the + * chunk mutex and we may need to access that extent buffer or its parent + * in order to add the chunk item or update a device item. + * + * Tasks that want to modify the chunk tree should reserve system space + * before updating the chunk btree, by calling either + * btrfs_reserve_chunk_metadata() or check_system_chunk(). + * It's possible that after a task reserves the space, it still ends up + * here - this happens in the cases described above at do_chunk_alloc(). + * The task will have to either retry or fail. 
+ */ + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -ENOSPC; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + + do { + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + should_alloc = should_alloc_chunk(fs_info, space_info, force); + if (space_info->full) { + /* No more free physical space */ + if (should_alloc) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } else if (!should_alloc) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + /* + * Someone is already allocating, so we need to block + * until this someone is finished and then loop to + * recheck if we should continue with our allocation + * attempt. + */ + wait_for_alloc = true; + force = CHUNK_ALLOC_NO_FORCE; + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + /* Proceed with allocation */ + space_info->chunk_alloc = 1; + wait_for_alloc = false; + spin_unlock(&space_info->lock); + } + + cond_resched(); + } while (wait_for_alloc); + + mutex_lock(&fs_info->chunk_mutex); + trans->allocating_chunk = true; + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + ret_bg = do_chunk_alloc(trans, flags); + trans->allocating_chunk = false; + + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. + */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + + spin_lock(&space_info->lock); + if (ret < 0) { + if (ret == -ENOSPC) + space_info->full = 1; + else + goto out; + } else { + ret = 1; + space_info->max_extent_size = 0; + } + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); + + return ret; +} + +static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +{ + u64 num_dev; + + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) + num_dev = fs_info->fs_devices->rw_devices; + + return num_dev; +} + +static void reserve_chunk_space(struct btrfs_trans_handle *trans, + u64 bytes, + u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *info; + u64 left; + int ret = 0; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. 
+ */ + lockdep_assert_held(&fs_info->chunk_mutex); + + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - btrfs_space_info_used(info, true); + spin_unlock(&info->lock); + + if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", + left, bytes, type); + btrfs_dump_space_info(fs_info, info, 0, 0); + } + + if (left < bytes) { + u64 flags = btrfs_system_alloc_profile(fs_info); + struct btrfs_block_group *bg; + + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + bg = btrfs_create_chunk(trans, flags); + if (IS_ERR(bg)) { + ret = PTR_ERR(bg); + } else { + /* + * We have a new chunk. We also need to activate it for + * zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + if (ret < 0) + return; + + /* + * If we fail to add the chunk item here, we end up + * trying again at phase 2 of chunk allocation, at + * btrfs_create_pending_block_groups(). So ignore + * any error here. An ENOSPC here could happen, due to + * the cases described at do_chunk_alloc() - the system + * block group we just created was just turned into RO + * mode by a scrub for example, or a running discard + * temporarily removed its free space entries, etc. + */ + btrfs_chunk_alloc_add_chunk_item(trans, bg); + } + } + + if (!ret) { + ret = btrfs_block_rsv_add(fs_info, + &fs_info->chunk_block_rsv, + bytes, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += bytes; + } +} + +/* + * Reserve space in the system space for allocating or removing a chunk. + * The caller must be holding fs_info->chunk_mutex. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 num_devs = get_profile_num_devs(fs_info, type); + u64 bytes; + + /* num_devs device items to update and 1 chunk item to add or remove. */ + bytes = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + reserve_chunk_space(trans, bytes, type); +} + +/* + * Reserve space in the system space, if needed, for doing a modification to the + * chunk btree. + * + * @trans: A transaction handle. + * @is_item_insertion: Indicate if the modification is for inserting a new item + * in the chunk btree or if it's for the deletion or update + * of an existing item. + * + * This is used in a context where we need to update the chunk btree outside + * block group allocation and removal, to avoid a deadlock with a concurrent + * task that is allocating a metadata or data block group and therefore needs to + * update the chunk btree while holding the chunk mutex. After the update to the + * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. 
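+ *
+ * A typical caller therefore does something like (sketch):
+ *
+ *   btrfs_reserve_chunk_metadata(trans, true);
+ *   ... insert or update items in the chunk btree ...
+ *   btrfs_trans_release_chunk_metadata(trans);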
+ * + */ +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 bytes; + + if (is_item_insertion) + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + else + bytes = btrfs_calc_metadata_size(fs_info, 1); + + mutex_lock(&fs_info->chunk_mutex); + reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); + mutex_unlock(&fs_info->chunk_mutex); +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group *block_group; + + block_group = btrfs_lookup_first_block_group(info, 0); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, + &block_group->runtime_flags)) { + struct inode *inode = block_group->inode; + + block_group->inode = NULL; + spin_unlock(&block_group->lock); + + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + block_group = btrfs_next_block_group(block_group); + } +} + +/* + * Must be called only after stopping all workers, since we could have block + * group caching kthreads running, and therefore they could race with us if we + * freed the block groups before stopping them. + */ +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + if (btrfs_is_zoned(info)) { + if (info->active_meta_bg) { + btrfs_put_block_group(info->active_meta_bg); + info->active_meta_bg = NULL; + } + if (info->active_system_bg) { + btrfs_put_block_group(info->active_system_bg); + info->active_system_bg = NULL; + } + } + + write_lock(&info->block_group_cache_lock); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } + write_unlock(&info->block_group_cache_lock); + + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + + while (!list_empty(&info->reclaim_bgs)) { + block_group = list_first_entry(&info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + + spin_lock(&info->zone_active_bgs_lock); + while (!list_empty(&info->zone_active_bgs)) { + block_group = list_first_entry(&info->zone_active_bgs, + struct btrfs_block_group, + active_bg_list); + list_del_init(&block_group->active_bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->zone_active_bgs_lock); + + write_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group, + cache_node); + rb_erase_cached(&block_group->cache_node, + &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + write_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this 
block group. + */ + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) + btrfs_free_excluded_extents(block_group); + + btrfs_remove_free_space_cache(block_group); + ASSERT(block_group->cached != BTRFS_CACHE_STARTED); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(refcount_read(&block_group->refs) == 1); + ASSERT(block_group->swap_extents == 0); + btrfs_put_block_group(block_group); + + write_lock(&info->block_group_cache_lock); + } + write_unlock(&info->block_group_cache_lock); + + btrfs_release_global_block_rsv(info); + + while (!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + + WARN_ON(space_info->reclaim_size > 0); + list_del(&space_info->list); + btrfs_sysfs_remove_space_info(space_info); + } + return 0; +} + +void btrfs_freeze_block_group(struct btrfs_block_group *cache) +{ + atomic_inc(&cache->frozen); +} + +void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + bool cleanup; + + spin_lock(&block_group->lock); + cleanup = (atomic_dec_and_test(&block_group->frozen) && + test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); + spin_unlock(&block_group->lock); + + if (cleanup) { + em_tree = &fs_info->mapping_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->start, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We may have left one free space entry and other possible + * tasks trimming this block group have left 1 entry each one. + * Free them if any. + */ + btrfs_remove_free_space_cache(block_group); + } +} + +bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) +{ + bool ret = true; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + bg->swap_extents++; + spin_unlock(&bg->lock); + + return ret; +} + +void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) +{ + spin_lock(&bg->lock); + ASSERT(!bg->ro); + ASSERT(bg->swap_extents >= amount); + bg->swap_extents -= amount; + spin_unlock(&bg->lock); +} + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. 
+ * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. A call to find_free_extent got desperate enough to set + * 'force_wrong_slab'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h new file mode 100644 index 0000000000..2bdbcb834f --- /dev/null +++ b/fs/btrfs/block-group.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_GROUP_H +#define BTRFS_BLOCK_GROUP_H + +#include "free-space-cache.h" + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, +}; + +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + +/* + * This describes the state of the block_group for async discard. This is due + * to the two pass nature of it where extent discarding is prioritized over + * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting + * between lists to prevent contention for discard state variables + * (eg. discard_cursor). + */ +enum btrfs_discard_state { + BTRFS_DISCARD_EXTENTS, + BTRFS_DISCARD_BITMAPS, + BTRFS_DISCARD_RESET_CURSOR, +}; + +/* + * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to + * only allocate a chunk if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few + * chunks already allocated. 
This is used as part of the clustering code to + * help make sure we have a good pool of storage to cluster in, without filling + * the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, +}; + +/* Block group flags set at runtime */ +enum btrfs_block_group_flags { + BLOCK_GROUP_FLAG_IREF, + BLOCK_GROUP_FLAG_REMOVED, + BLOCK_GROUP_FLAG_TO_COPY, + BLOCK_GROUP_FLAG_RELOCATING_REPAIR, + BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + /* Does the block group need to be added to the free space tree? */ + BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Indicate that the block group is placed on a sequential zone */ + BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, + /* + * Indicate that block group is in the list of new block groups of a + * transaction. + */ + BLOCK_GROUP_FLAG_NEW, +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_work work; + struct btrfs_block_group *block_group; + /* Track progress of caching during allocation. */ + atomic_t progress; + refcount_t count; +}; + +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP SZ_2M + +struct btrfs_block_group { + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 start; + u64 length; + u64 pinned; + u64 reserved; + u64 used; + u64 delalloc_bytes; + u64 bytes_super; + u64 flags; + u64 cache_generation; + u64 global_root_id; + + /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; + + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + + /* For raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + unsigned long runtime_flags; + + unsigned int ro; + + int disk_cache_state; + + /* Cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + + struct btrfs_space_info *space_info; + + /* Free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* Block group cache stuff */ + struct rb_node cache_node; + + /* For block groups in the same raid type */ + struct list_head list; + + refcount_t refs; + + /* + * List of struct btrfs_free_clusters for this block group. 
+ * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; + + /* + * Used for several lists: + * + * 1) struct btrfs_fs_info::unused_bgs + * 2) struct btrfs_fs_info::reclaim_bgs + * 3) struct btrfs_transaction::deleted_bgs + * 4) struct btrfs_trans_handle::new_bgs + */ + struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + /* + * When non-zero it means the block group's logical address and its + * device extents can not be reused for future block group allocations + * until the counter goes down to 0. This is to prevent them from being + * reused while some task is still using the block group after it was + * deleted - we want to make sure they can only be reused for new block + * groups after that task is done with the deleted block group. + */ + atomic_t frozen; + + /* For discard operations */ + struct list_head discard_list; + int discard_index; + u64 discard_eligible_time; + u64 discard_cursor; + enum btrfs_discard_state discard_state; + + /* For dirty block groups */ + struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; + + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Number of extents in this block group used for swap files. + * All accesses protected by the spinlock 'lock'. + */ + int swap_extents; + + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. + */ + u64 alloc_offset; + u64 zone_unusable; + u64 zone_capacity; + u64 meta_write_pointer; + struct map_lookup *physical_map; + struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; +}; + +static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +{ + return (block_group->start + block_group->length); +} + +static inline bool btrfs_is_block_group_data_only( + struct btrfs_block_group *block_group) +{ + /* + * In mixed mode the fragmentation is expected to be high, lowering the + * efficiency, so only proper data block groups are considered. 
+ */ + return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); +} + +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +#endif + +struct btrfs_block_group *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache); +void btrfs_get_block_group(struct btrfs_block_group *cache); +void btrfs_put_block_group(struct btrfs_block_group *cache); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, + u64 num_bytes); +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); +void btrfs_put_caching_control(struct btrfs_caching_control *ctl); +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group *cache); +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, + const u64 chunk_offset); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_unused(struct btrfs_block_group *bg); +void btrfs_reclaim_bgs_work(struct work_struct *work); +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); +int btrfs_read_block_groups(struct btrfs_fs_info *info); +struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, + u64 type, + u64 chunk_offset, u64 size); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); +int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc); +void btrfs_dec_block_group_ro(struct btrfs_block_group *cache); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool alloc); +int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, + u64 num_bytes, int delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); +void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion); +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_rmap_block(struct 
btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len); + +static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + +static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; +} + +void btrfs_freeze_block_group(struct btrfs_block_group *cache); +void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); + +bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); +void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); + +#endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c new file mode 100644 index 0000000000..77684c5e0c --- /dev/null +++ b/fs/btrfs/block-rsv.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "block-rsv.h" +#include "space-info.h" +#include "transaction.h" +#include "block-group.h" +#include "disk-io.h" +#include "fs.h" +#include "accessors.h" + +/* + * HOW DO BLOCK RESERVES WORK + * + * Think of block_rsv's as buckets for logically grouped metadata + * reservations. Each block_rsv has a ->size and a ->reserved. ->size is + * how large we want our block rsv to be, ->reserved is how much space is + * currently reserved for this block reserve. + * + * ->failfast exists for the truncate case, and is described below. + * + * NORMAL OPERATION + * + * -> Reserve + * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill + * + * We call into btrfs_reserve_metadata_bytes() with our bytes, which is + * accounted for in space_info->bytes_may_use, and then add the bytes to + * ->reserved, and ->size in the case of btrfs_block_rsv_add. + * + * ->size is an over-estimation of how much we may use for a particular + * operation. + * + * -> Use + * Entrance: btrfs_use_block_rsv + * + * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv() + * to determine the appropriate block_rsv to use, and then verify that + * ->reserved has enough space for our tree block allocation. Once + * successful we subtract fs_info->nodesize from ->reserved. + * + * -> Finish + * Entrance: btrfs_block_rsv_release + * + * We are finished with our operation, subtract our individual reservation + * from ->size, and then subtract ->size from ->reserved and free up the + * excess if there is any. + * + * There is some logic here to refill the delayed refs rsv or the global rsv + * as needed, otherwise the excess is subtracted from + * space_info->bytes_may_use. 
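+ *
+ * Putting the three steps together, a simplified consumer looks like this
+ * (sketch, flush mode and error handling omitted):
+ *
+ *   btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
+ *   ...
+ *   btrfs_alloc_tree_block(...);   (draws from the rsv via btrfs_use_block_rsv())
+ *   ...
+ *   btrfs_block_rsv_release(fs_info, rsv, num_bytes, NULL);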
+ * + * TYPES OF BLOCK RESERVES + * + * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK + * These behave normally, as described above, just within the confines of the + * lifetime of their particular operation (transaction for the whole trans + * handle lifetime, for example). + * + * BLOCK_RSV_GLOBAL + * It is impossible to properly account for all the space that may be required + * to make our extent tree updates. This block reserve acts as an overflow + * buffer in case our delayed refs reserve does not reserve enough space to + * update the extent tree. + * + * We can steal from this in some cases as well, notably on evict() or + * truncate() in order to help users recover from ENOSPC conditions. + * + * BLOCK_RSV_DELALLOC + * The individual item sizes are determined by the per-inode size + * calculations, which are described with the delalloc code. This is pretty + * straightforward, it's just the calculation of ->size encodes a lot of + * different items, and thus it gets used when updating inodes, inserting file + * extents, and inserting checksums. + * + * BLOCK_RSV_DELREFS + * We keep a running tally of how many delayed refs we have on the system. + * We assume each one of these delayed refs are going to use a full + * reservation. We use the transaction items and pre-reserve space for every + * operation, and use this reservation to refill any gap between ->size and + * ->reserved that may exist. + * + * From there it's straightforward, removing a delayed ref means we remove its + * count from ->size and free up reservations as necessary. Since this is + * the most dynamic block reserve in the system, we will try to refill this + * block reserve first with any excess returned by any other block reserve. + * + * BLOCK_RSV_EMPTY + * This is the fallback block reserve to make us try to reserve space if we + * don't have a specific bucket for this allocation. It is mostly used for + * updating the device tree and such, since that is a separate pool we're + * content to just reserve space from the space_info on demand. + * + * BLOCK_RSV_TEMP + * This is used by things like truncate and iput. We will temporarily + * allocate a block reserve, set it to some size, and then truncate bytes + * until we have no space left. With ->failfast set we'll simply return + * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try + * to make a new reservation. This is because these operations are + * unbounded, so we want to do as much work as we can, and then back off and + * re-reserve. 
+ */ + +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; + u64 ret; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) { + num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = true; + } else { + num_bytes = 0; + } + if (qgroup_to_release_ret && + block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } + spin_unlock(&block_rsv->lock); + + ret = num_bytes; + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = true; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) + btrfs_space_info_free_bytes_may_use(fs_info, + space_info, + num_bytes); + } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; + return ret; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + bool update_size) +{ + int ret; + + ret = btrfs_block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + rsv->type = type; +} + +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + enum btrfs_rsv_type type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + enum btrfs_rsv_type type) +{ + struct btrfs_block_rsv *block_rsv; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + if (!rsv) + return; + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + kfree(rsv); +} + +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + if (!ret) + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + num_bytes = mult_perc(block_rsv->size, min_percent); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv 
*block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) + ret = 0; + else + num_bytes -= block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (!ret) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + if (!ret) { + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); + return 0; + } + + return ret; +} + +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = NULL; + + /* + * If we are the delayed_rsv then push to the global rsv, otherwise dump + * into the delayed rsv if it is not full. + */ + if (block_rsv == delayed_rsv) + target = global_rsv; + else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) + target = delayed_rsv; + + if (target && block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) +{ + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = false; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = true; + spin_unlock(&block_rsv->lock); +} + +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + struct btrfs_root *root, *tmp; + u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item); + unsigned int min_items = 1; + + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. + * + * We also are going to need to modify the minimum of the tree root and + * any global roots we could touch. + */ + read_lock(&fs_info->global_root_lock); + rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, + rb_node) { + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || + root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + num_bytes += btrfs_root_used(&root->root_item); + min_items++; + } + } + read_unlock(&fs_info->global_root_lock); + + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item); + min_items++; + } + + /* + * But we also want to reserve enough space so we can do the fallback + * global reserve for an unlink, which is an additional + * BTRFS_UNLINK_METADATA_UNITS items. + * + * But we also need space for the delayed ref updates from the unlink, + * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for + * each unlink metadata item. 
+ */ + min_items += BTRFS_UNLINK_METADATA_UNITS; + + num_bytes = max_t(u64, num_bytes, + btrfs_calc_insert_metadata_size(fs_info, min_items) + + btrfs_calc_delayed_ref_bytes(fs_info, + BTRFS_UNLINK_METADATA_UNITS)); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); + + block_rsv->size = min_t(u64, num_bytes, SZ_512M); + + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); + block_rsv->reserved = block_rsv->size; + } else if (block_rsv->reserved > block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + -num_bytes); + block_rsv->reserved = block_rsv->size; + btrfs_try_granting_tickets(fs_info, sinfo); + } + + block_rsv->full = (block_rsv->reserved == block_rsv->size); + + if (block_rsv->size >= sinfo->total_bytes) + sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); +} + +void btrfs_init_root_block_rsv(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + switch (root->root_key.objectid) { + case BTRFS_CSUM_TREE_OBJECTID: + case BTRFS_EXTENT_TREE_OBJECTID: + case BTRFS_FREE_SPACE_TREE_OBJECTID: + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + root->block_rsv = &fs_info->delayed_refs_rsv; + break; + case BTRFS_ROOT_TREE_OBJECTID: + case BTRFS_DEV_TREE_OBJECTID: + case BTRFS_QUOTA_TREE_OBJECTID: + root->block_rsv = &fs_info->global_block_rsv; + break; + case BTRFS_CHUNK_TREE_OBJECTID: + root->block_rsv = &fs_info->chunk_block_rsv; + break; + default: + root->block_rsv = NULL; + break; + } +} + +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; + + btrfs_update_global_block_rsv(fs_info); +} + +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1, + NULL); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); +} + +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = NULL; + + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || + (root == fs_info->uuid_root) || + (trans->adding_csums && + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) + block_rsv = trans->block_rsv; + + if (!block_rsv) + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &fs_info->empty_block_rsv; + + return block_rsv; +} + +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize) +{ + struct 
btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool global_updated = false; + + block_rsv = get_block_rsv(trans, root); + + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + btrfs_update_global_block_rsv(fs_info); + goto again; + } + + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv %d returned %d\n", + block_rsv->type, ret); + } +try_reserve: + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + + /* + * All hope is lost, but of course our reservations are overly + * pessimistic, so instead of possibly having an ENOSPC abort here, try + * one last time to force a reservation if there's enough actual space + * on disk to make the reservation. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_FLUSH_EMERGENCY); + if (!ret) + return block_rsv; + + return ERR_PTR(ret); +} + +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + u64 needed_bytes; + int ret; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); + + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h new file mode 100644 index 0000000000..b0bd12b865 --- /dev/null +++ b/fs/btrfs/block-rsv.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_RSV_H +#define BTRFS_BLOCK_RSV_H + +struct btrfs_trans_handle; +struct btrfs_root; +enum btrfs_reserve_flush_enum; + +/* + * Types of block reserves + */ +enum btrfs_rsv_type { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + struct btrfs_space_info *space_info; + spinlock_t lock; + bool full; + bool failfast; + /* Block reserve type, one of BTRFS_BLOCK_RSV_* */ + enum btrfs_rsv_type type:8; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. 
+ * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. + */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; +}; + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type); +void btrfs_init_root_block_rsv(struct btrfs_root *root); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + enum btrfs_rsv_type type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + enum btrfs_rsv_type type); +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + bool update_size); +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size); +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release); +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize); +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u32 blocksize) +{ + btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); + btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL); +} + +/* + * Fast path to check if the reserve is full, may be carefully used outside of + * locks. + */ +static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv) +{ + return data_race(rsv->full); +} + +#endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 0000000000..bda1fdbba6 --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,534 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#ifndef BTRFS_INODE_H +#define BTRFS_INODE_H + +#include +#include +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" +#include "delayed-inode.h" + +/* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. 
When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +enum { + BTRFS_INODE_FLUSH_ON_CLOSE, + BTRFS_INODE_DUMMY, + BTRFS_INODE_IN_DEFRAG, + BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). + */ + BTRFS_INODE_NEEDS_FULL_SYNC, + BTRFS_INODE_COPY_EVERYTHING, + BTRFS_INODE_IN_DELALLOC_LIST, + BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, + /* + * Set and used when logging an inode and it serves to signal that an + * inode does not have xattrs, so subsequent fsyncs can avoid searching + * for xattrs to log. This bit must be cleared whenever a xattr is added + * to an inode. + */ + BTRFS_INODE_NO_XATTRS, + /* + * Set when we are in a context where we need to start a transaction and + * have dirty pages with the respective file range locked. This is to + * ensure that when reserving space for the transaction, if we are low + * on available space and need to flush delalloc, we will not flush + * delalloc for this inode, because that could result in a deadlock (on + * the file range, inode's io_tree). + */ + BTRFS_INODE_NO_DELALLOC_FLUSH, + /* + * Set when we are working on enabling verity for a file. Computing and + * writing the whole Merkle tree can take a while so we want to prevent + * races where two separate tasks attempt to simultaneously start verity + * on the same file. + */ + BTRFS_INODE_VERITY_IN_PROGRESS, + /* Set when this inode is a free space inode. */ + BTRFS_INODE_FREE_SPACE_INODE, +}; + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans), to access/update new_delalloc_bytes and to update the + * VFS' inode number of bytes used. + */ + spinlock_t lock; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* + * Keep track of where the inode has extent items mapped in order to + * make sure the i_size adjustments are accurate + */ + struct extent_io_tree file_extent_tree; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + + unsigned long runtime_flags; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. 
+ */ + u64 generation; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * log transid when this inode was last modified + */ + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; + + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The lowest possible index of the next dir index key which + * points to an inode that needs to be logged. + * This is used only for directories. + * Use the helpers btrfs_get_first_dir_index_to_log() and + * btrfs_set_first_dir_index_to_log() to access this field. + */ + u64 first_dir_index_to_log; + }; + + union { + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes and this + * is used only for files. + */ + u64 new_delalloc_bytes; + /* + * The offset of the last dir index key that was logged. + * This is used only for directories. + */ + u64 last_dir_index_offset; + }; + + /* + * total number of bytes pending defrag, used by stat to check whether + * it needs COW. + */ + u64 defrag_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. + */ + u64 index_cnt; + + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * The id/generation of the last transaction where this inode was + * either the source or the destination of a clone/dedupe operation. + * Used when logging an inode to know if there are shared extents that + * need special care when logging checksum items, to avoid duplicate + * checksum items in a log (which can lead to a corruption where we end + * up with missing checksum ranges after log replay). + * Protected by the vfs inode lock. + */ + u64 last_reflink_trans; + + /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + + /* Backwards incompatible flags, lower half of inode_item::flags */ + u32 flags; + /* Read-only compatibility flags, upper half of inode_item::flags */ + u32 ro_flags; + + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. 
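+ * The count is adjusted via btrfs_mod_outstanding_extents(), which must be
+ * called with the inode's spinlock 'lock' held.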
+ */ + unsigned outstanding_extents; + + struct btrfs_block_rsv block_rsv; + + /* + * Cached values of inode properties + */ + unsigned prop_compress; /* per-file compression algorithm */ + /* + * Force compression on the file using the defrag ioctl, could be + * different from prop_compress and takes precedence if set + */ + unsigned defrag_compress; + + struct btrfs_delayed_node *delayed_node; + + /* File creation time. */ + struct timespec64 i_otime; + + /* Hook into fs_info->delayed_iputs */ + struct list_head delayed_iput; + + struct rw_semaphore i_mmap_lock; + struct inode vfs_inode; +}; + +static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) +{ + return READ_ONCE(inode->first_dir_index_to_log); +} + +static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, + u64 index) +{ + WRITE_ONCE(inode->first_dir_index_to_log, index); +} + +static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline unsigned long btrfs_inode_hash(u64 objectid, + const struct btrfs_root *root) +{ + u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME); + +#if BITS_PER_LONG == 32 + h = (h >> 32) ^ (h & 0xffffffff); +#endif + + return (unsigned long)h; +} + +#if BITS_PER_LONG == 32 + +/* + * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so + * we use the inode's location objectid which is a u64 to avoid truncation. + */ +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + u64 ino = inode->location.objectid; + + /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ + if (inode->location.type == BTRFS_ROOT_ITEM_KEY) + ino = inode->vfs_inode.i_ino; + return ino; +} + +#else + +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + return inode->vfs_inode.i_ino; +} + +#endif + +static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) +{ + i_size_write(&inode->vfs_inode, size); + inode->disk_i_size = size; +} + +static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) +{ + return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); +} + +static inline bool is_data_inode(struct inode *inode) +{ + return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; +} + +static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, + int mod) +{ + lockdep_assert_held(&inode->lock); + inode->outstanding_extents += mod; + if (btrfs_is_free_space_inode(inode)) + return; + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), + mod, inode->outstanding_extents); +} + +/* + * Called every time after doing a buffered, direct IO or memory mapped write. + * + * This is to ensure that if we write to a file that was previously fsynced in + * the current transaction, then try to fsync it again in the same transaction, + * we will know that there were changes in the file and that it needs to be + * logged. + */ +static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); +} + +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). 
+ */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. + */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + +static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) +{ + bool ret = false; + + spin_lock(&inode->lock); + if (inode->logged_trans == generation && + inode->last_sub_trans <= inode->last_log_commit && + inode->last_sub_trans <= inode->root->last_log_commit) + ret = true; + spin_unlock(&inode->lock); + return ret; +} + +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} + +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define CSUM_FMT "0x%*phN" +#define CSUM_FMT_VALUE(size, bytes) size, bytes + +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index); +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context); +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state); + +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* Output from btrfs_new_inode_prepare(), 
input to btrfs_create_new_inode(). */ + struct posix_acl *default_acl; + struct posix_acl *acl; + struct fscrypt_name fname; +}; + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, + struct inode *dir); + void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, + u32 bits); +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *state, u32 bits); +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *orig, u64 split); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); +void btrfs_evict_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int __init btrfs_init_cachep(void); +void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); +void btrfs_add_delayed_iput(struct btrfs_inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + +extern const struct 
dentry_operations btrfs_dentry_operations; + +/* Inode locking type flags, by default the exclusive lock is taken. */ +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; + +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, + const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); + +#endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 0000000000..3caf339c4b --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,2871 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and no write error was indicated and a + * FLUSH request to the device where these blocks are + * located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. 
In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + * + * Expect millions of lines of information in the kernel log with an + * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the + * kernel config to at least 26 (which is 64MB). Usually the value is + * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be + * changed like this before LOG_BUF_SHIFT can be set to a high value: + * config LOG_BUF_SHIFT + * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + * range 12 30 + */ + +#include +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "extent_io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "compression.h" +#include "accessors.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head 
ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_private; + bio_end_io_t *orig_bio_end_io; + blk_opf_t submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block referred from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char **datav; + struct page **pagev; + void *mem_to_free; +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_fs_info *fs_info; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; +}; + +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block 
**next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_private = NULL; + b->orig_bio_end_io = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + 
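/*
 * Illustrative sketch only -- not part of the upstream file being added by
 * this patch.  It shows the lookup-or-add pattern that the rest of this
 * module repeats for every tree block it follows: a logical bytenr is mapped
 * to a (device, physical bytenr) pair, the matching btrfsic_block is fetched
 * from or inserted into the per-mount hashtable, and the mapping context is
 * released again.  The function name btrfsic_lookup_example and its
 * parameter choices are invented for the example; the helpers it calls are
 * the ones forward-declared above and defined further down in this file.
 */
static struct btrfsic_block *btrfsic_lookup_example(struct btrfsic_state *state,
						    u64 bytenr, int mirror_num)
{
	struct btrfsic_block_data_ctx block_ctx;
	struct btrfsic_block *block;
	int was_created;

	/* translate the logical address into dev + dev_bytenr for one mirror */
	if (btrfsic_map_block(state, bytenr, state->metablock_size,
			      &block_ctx, mirror_num))
		return NULL;

	/* reuse an existing tracking object or allocate and hash a new one */
	block = btrfsic_block_lookup_or_add(state, &block_ctx, "referenced ",
					    1 /* is_metadata */,
					    1 /* is_iodone */,
					    0 /* never_written */,
					    mirror_num, &was_created);

	/* the ctx only described the mapping; the block keeps its own copy */
	btrfsic_release_block_ctx(&block_ctx);
	return block;
}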
+static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct btrfsic_block *b; + + list_for_each_entry(b, h->table + hashval, collision_resolving_node) { + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & 
(BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct btrfsic_block_link *l; + + list_for_each_entry(l, h->table + hashval, collision_resolving_node) { + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); + struct btrfsic_dev_state *ds; + + list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { + if (ds->bdev->bd_dev == dev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int ret = 0; + int pass; + + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); + if (!selected_super) + return -ENOMEM; + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + pr_info("btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + 
BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + pr_info("root@%llu\n", next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + pr_info("chunk@%llu\n", next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + pr_info("log@%llu\n", next_bytenr); + break; + } + + num_copies = btrfs_num_copies(state->fs_info, next_bytenr, + state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + pr_info("num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n", + next_bytenr, mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)PAGE_SIZE) { + pr_info("btrfsic: read @logical %llu failed!\n", + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + struct page *page; + struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; + int ret = 0; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) + return -1; + + page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) + return -1; + + super_tmp = page_address(page); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + btrfs_super_magic(super_tmp) != BTRFS_MAGIC || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { + ret = 0; + goto out; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = 
btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + ret = -1; + goto out; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + btrfs_info_in_rcu(fs_info, + "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", + superblock_bdev, + btrfs_dev_name(device), dev_bytenr, + dev_state->bdev, dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = btrfs_num_copies(fs_info, next_bytenr, + state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + pr_info("num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num)) { + pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", + next_bytenr, mirror_num); + ret = -1; + goto out; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + ret = -1; + goto out; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + ret = -1; + goto out; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + +out: + put_page(page); + return 
ret; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (sf) + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static noinline_for_stack int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + + BUG_ON(!first_hdr); + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = btrfs_stack_header_generation(sf->hdr); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = btrfs_stack_header_nritems(&leafhdr->header); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("leaf %llu items %d generation %llu owner %llu\n", + sf->block_ctx->start, sf->nr, + btrfs_stack_header_generation( + &leafhdr->header), + btrfs_stack_header_owner( + &leafhdr->header)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; + u8 type; + u32 item_offset; + u32 item_size; + + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + pr_info( + "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", + sf->block_ctx->start, + sf->block_ctx->dev->bdev); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_size(&disk_item); + disk_key = &disk_item.key; + type = btrfs_disk_key_type(disk_key); + + if (BTRFS_ROOT_ITEM_KEY == type) { + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + item_size > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + item_size); + next_bytenr = btrfs_root_bytenr(&root_item); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + btrfs_root_generation( + &root_item)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL 
== next_stack) { + sf->error = -1; + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = btrfs_stack_header_nritems(&nodehdr->header); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("node %llu level %d items %d generation %llu owner %llu\n", + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + btrfs_stack_header_generation( + &nodehdr->header), + btrfs_stack_header_owner( + &nodehdr->header)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + pr_info( + "btrfsic: node item out of bounce at logical %llu, dev %pg\n", + sf->block_ctx->start, + sf->block_ctx->dev->bdev); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = btrfs_stack_key_blockptr(&key_ptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &key_ptr.key, + btrfs_stack_key_generation(&key_ptr)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + sf->error = -1; + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t 
pgoff; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = offset_in_page(block_ctx->start); + unsigned long i = (start_offset + offset) >> PAGE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + pgoff = offset_in_page(start_offset + offset); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_SIZE - pgoff)); + BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + pgoff, cur); + + dst += cur; + len -= cur; + pgoff = 0; + i++; + } +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = btrfs_num_copies(fs_info, next_bytenr, + state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + pr_info("num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + next_block_ctx, *mirror_nump); + if (ret) { + pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + next_bytenr, next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block), + next_block->logical_bytenr); + else + pr_info( + "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", + next_bytenr, next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block)); + } + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = 
parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)next_block_ctx->len) { + pr_info("btrfsic: read block @logical %llu failed!\n", + next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; + struct btrfsic_block_link *l; + int ret; + + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + pr_info("extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + btrfs_stack_file_extent_disk_bytenr( + &file_extent_item)); + return 0; + } + + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } + generation = btrfs_stack_file_extent_generation(&file_extent_item); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n", + file_extent_item.type, + btrfs_stack_file_extent_disk_bytenr(&file_extent_item), + btrfs_stack_file_extent_offset(&file_extent_item), + num_bytes); + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; + else + chunk_len = num_bytes; + + num_copies = btrfs_num_copies(fs_info, next_bytenr, + state->datablock_size); + if 
(state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + pr_info("num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n", + mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + pr_info("\tdisk_bytenr = %llu, num_bytes %u\n", + next_bytenr, chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + next_bytenr, mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if ((state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) && + next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", + next_bytenr, + next_block_ctx.dev->bdev, + next_block_ctx.dev_bytenr, + mirror_num, + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + int ret; + u64 length; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap, *map; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, + NULL, &mirror_num, 0); + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + + if (bioc) + map = &bioc->stripes[0]; + else + map = &smap; + + device = map->dev; + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || + !device->bdev || !device->name) + block_ctx_out->dev = NULL; + else + block_ctx_out->dev = btrfsic_dev_state_lookup( + device->bdev->bd_dev); + block_ctx_out->dev_bytenr = map->physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + kfree(bioc); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> + PAGE_SHIFT; + /* Pages must be unmapped in reverse order */ + while (num_pages > 0) { + num_pages--; + if 
(block_ctx->datav[num_pages]) + block_ctx->datav[num_pages] = NULL; + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + unsigned int num_pages; + unsigned int i; + size_t size; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { + pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", + block_ctx->dev_bytenr); + return -1; + } + + num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> + PAGE_SHIFT; + size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev); + block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); + if (!block_ctx->mem_to_free) + return -ENOMEM; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); + if (ret) + return ret; + + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + + bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, + REQ_OP_READ, GFP_NOFS); + bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_SIZE, 0); + if (PAGE_SIZE != ret) + break; + } + if (j == i) { + pr_info("btrfsic: error, failed to add a single page!\n"); + return -1; + } + if (submit_bio_wait(bio)) { + pr_info("btrfsic: read error at logical %llu dev %pg!\n", + block_ctx->start, block_ctx->dev->bdev); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) + block_ctx->datav[i] = page_address(block_ctx->pagev[i]); + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + const struct btrfsic_block *b_all; + + BUG_ON(NULL == state); + + pr_info("all_blocks_list:\n"); + list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { + const struct btrfsic_block_link *l; + + pr_info("%c-block @%llu (%pg/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->bdev, + b_all->dev_bytenr, b_all->mirror_num); + + list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { + pr_info( + " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->bdev, + b_all->dev_bytenr, b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { + pr_info( + " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->bdev, + b_all->dev_bytenr, b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->bdev, + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + pr_info("\n"); + } +} + +/* + * Test whether the disk block contains a tree block 
(leaf or node) + * (note that this test fails for the super block) + */ +static noinline_for_stack int btrfsic_test_for_metadata( + struct btrfsic_state *state, + char **datav, unsigned int num_pages) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + unsigned int i; + + if (num_pages * PAGE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_SHIFT; + h = (struct btrfs_header *)datav[0]; + + if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) + return 1; + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_SIZE : + (PAGE_SIZE - BTRFS_CSUM_SIZE); + + crypto_shash_update(shash, data, sublen); + } + crypto_shash_final(shash, csum); + if (memcmp(csum, h->csum, fs_info->csum_size)) + return 1; + + return 0; /* is metadata */ +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + blk_opf_t submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; + + if (NULL != bio_is_patched) + *bio_is_patched = 0; + +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr = 0; + struct btrfsic_block_link *l, *tmp; + + if (block->is_superblock) { + bytenr = btrfs_super_bytenr((struct btrfs_super_block *) + mapped_datav[0]); + if (num_pages * PAGE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + pr_info("btrfsic: cannot work with too short bios!\n"); + return; + } + is_metadata = 1; + BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); + processed_len = BTRFS_SUPER_INFO_SIZE; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + pr_info("[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + if (num_pages * PAGE_SIZE < + state->metablock_size) { + pr_info("btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr); + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) + pr_info( +"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + bytenr, dev_state->bdev, + dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, + block), + block->logical_bytenr); + else + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, + dev_bytenr, block->mirror_num, + btrfsic_get_block_type(state, + block)); + } + block->logical_bytenr = bytenr; + } else { + if (num_pages * PAGE_SIZE < + state->datablock_size) { + pr_info("btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = 
state->datablock_size; + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), bytenr, + dev_state->bdev, dev_bytenr, block->mirror_num, + block->generation, + btrfs_disk_key_objectid(&block->disk_key), + block->disk_key.type, + btrfs_disk_key_offset(&block->disk_key), + btrfs_stack_header_generation( + (struct btrfs_header *) mapped_datav[0]), + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", + btrfsic_get_block_type(state, block), bytenr, + dev_state->bdev, dev_bytenr, block->mirror_num, + block->generation, + btrfs_stack_header_generation( + (struct btrfs_header *) + mapped_datav[0])); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + goto continue_loop; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valuable information + * like whether it was ever written and IO completed. 
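
The teardown that follows implements what the comment above announces: when a tracked block is overwritten, its record is kept (so the checker still remembers whether it was ever written and whether IO completed), but one reference is dropped from every outgoing link, and a link is freed only once its count reaches zero. A minimal user-space sketch of that edge/refcount pattern, with invented names rather than btrfsic's:

#include <stdlib.h>

struct toy_edge {
	struct toy_edge *next;		/* next outgoing edge of this node */
	unsigned int ref_cnt;		/* times "from" references "to" */
};

struct toy_node {
	struct toy_edge *ref_to;	/* outgoing edges */
	int is_iodone;			/* history that must survive the overwrite */
};

/* Drop one reference from every outgoing edge; free an edge only when its
 * count reaches zero. The node and its history stay allocated. */
static void toy_drop_out_refs(struct toy_node *n)
{
	struct toy_edge **pp = &n->ref_to;

	while (*pp) {
		struct toy_edge *e = *pp;

		if (--e->ref_cnt == 0) {
			*pp = e->next;
			free(e);
		} else {
			pp = &e->next;
		}
	}
}
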
+ */ + list_for_each_entry_safe(l, tmp, &block->ref_to_list, + node_ref_to) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_private = + bio->bi_private; + block->orig_bio_end_io = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else { + block->is_iodone = 1; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + BUG_ON(PAGE_SIZE != + BTRFS_SUPER_INFO_SIZE); + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_datav[0]); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + pr_info("[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + 0, 0); + } + if (ret) + pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n", + dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + processed_len = state->datablock_size; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info( + "written block (%pg/%llu/?) !found in hash table, D\n", + dev_state->bdev, dev_bytenr); + if (!state->include_extent_data) { + /* ignore that written D block */ + goto continue_loop; + } + + /* this is getting ugly for the + * include_extent_data case... 
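
The bi_private/bi_end_io juggling above is how the checker observes write completion: it saves the bio owner's private pointer and end_io callback in the tracked block, points the bio at itself, and chains further blocks that ride on the same bio through next_in_same_bio. At completion the chain is walked and the original callback is restored and invoked. A simplified, self-contained model of that interposition (names invented; the real code records the first hook with the *bio_is_patched flag instead of comparing function pointers):

struct fake_bio {
	void (*end_io)(struct fake_bio *);
	void *private;
};

struct tracked_block {
	struct tracked_block *next_in_same_bio;
	void (*orig_end_io)(struct fake_bio *);
	void *orig_private;
	int is_iodone;
};

static void tracker_end_io(struct fake_bio *bio)
{
	struct tracked_block *b = bio->private;

	/* Restore the owner's completion state first ... */
	bio->private = b->orig_private;
	bio->end_io = b->orig_end_io;

	/* ... mark every block that rode on this bio as done ... */
	while (b) {
		struct tracked_block *next = b->next_in_same_bio;

		b->is_iodone = 1;
		b = next;
	}

	/* ... then hand completion back to the original owner. */
	bio->end_io(bio);
}

static void hook_block(struct fake_bio *bio, struct tracked_block *b)
{
	if (bio->end_io != tracker_end_io) {	/* first block on this bio */
		b->orig_private = bio->private;
		b->orig_end_io = bio->end_io;
		b->next_in_same_bio = NULL;
		bio->end_io = tracker_end_io;
	} else {				/* chain behind the earlier one */
		struct tracked_block *first = bio->private;

		b->orig_private = first->orig_private;
		b->orig_end_io = first->orig_end_io;
		b->next_in_same_bio = first;
	}
	bio->private = b;
}
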
*/ + bytenr = 0; /* unknown */ + } else { + processed_len = state->metablock_size; + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info( + "written block @%llu (%pg/%llu/?) !found in hash table, M\n", + bytenr, dev_state->bdev, dev_bytenr); + } + + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; + + block = btrfsic_block_alloc(); + if (NULL == block) { + btrfsic_release_block_ctx(&block_ctx); + goto continue_loop; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else { + block->is_iodone = 1; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", + is_metadata ? 'M' : 'D', + block->logical_bytenr, block->dev_state->bdev, + block->dev_bytenr, block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, 0, 0); + if (ret) + pr_info("btrfsic: process_metablock(root @%llu) failed!\n", + dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_SHIFT; + num_pages -= processed_len >> PAGE_SHIFT; + goto again; +} + +static void btrfsic_bio_end_io(struct bio *bp) +{ + struct btrfsic_block *block = bp->bi_private; + int iodone_w_error; + + /* mutex is not held! 
This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bp->bi_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_private; + bp->bi_end_io = block->orig_bio_end_io; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", + bp->bi_status, + btrfsic_get_block_type(dev_state->state, block), + block->logical_bytenr, dev_state->bdev, + block->dev_bytenr, block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + pr_info("bio_end_io() new %pg flush_gen=%llu\n", + dev_state->bdev, + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + pr_info( + "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", + superblock->logical_bytenr, + superblock->dev_state->bdev, + superblock->dev_bytenr, superblock->mirror_num, + btrfs_super_generation(super_hdr), + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + pr_info( + "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", + superblock->logical_bytenr, + superblock->dev_state->bdev, + superblock->dev_bytenr, superblock->mirror_num, + btrfs_super_generation(super_hdr), + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key = {0}; + + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_ITEM_KEY); + btrfs_set_disk_key_objectid(&tmp_disk_key, 0); + + switch (pass) { + case 0: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + pr_info("root@%llu\n", next_bytenr); + break; + case 1: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + pr_info("chunk@%llu\n", next_bytenr); + break; + case 2: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); + 
additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + pr_info("log@%llu\n", next_bytenr); + break; + } + + num_copies = btrfs_num_copies(fs_info, next_bytenr, + BTRFS_SUPER_INFO_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + pr_info("num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + next_bytenr, mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) + btrfsic_dump_tree(state); + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + const struct btrfsic_block_link *l; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage information. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. 
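
The recursion cap just above is what keeps the stack usage small: BTRFS_MAX_LEVEL is 8 in current kernels, so the walk never goes deeper than 3 + 8 = 11 frames, and stale or cyclic links left behind by freed-and-reused blocks simply stop the descent instead of looping forever. A tiny depth-bounded walk in the same spirit (toy types, not btrfsic's):

#define TOY_MAX_TREE_LEVEL	8		/* mirrors BTRFS_MAX_LEVEL */
#define TOY_MAX_WALK_DEPTH	(3 + TOY_MAX_TREE_LEVEL)	/* 11 frames */

struct toy_ref {
	struct toy_ref *next;			/* sibling edge */
	const struct toy_block *to;		/* referenced block */
};

struct toy_block {
	struct toy_ref *ref_to;
	int bad;				/* e.g. "never written" */
};

static int toy_any_bad(const struct toy_block *b, int depth)
{
	const struct toy_ref *r;

	if (depth >= TOY_MAX_WALK_DEPTH)
		return 0;		/* give up quietly, like "case 1" above */
	if (b->bad)
		return 1;
	for (r = b->ref_to; r; r = r->next)
		if (toy_any_bad(r->to, depth + 1))
			return 1;
	return 0;
}
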
+ */ + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->bdev, + block->dev_bytenr, block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->block_ref_to->iodone_w_error) { + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + l->block_ref_to->generation, + l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, block->flush_gen, + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + const struct btrfsic_block_link *l; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. 
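
Taken together, the checks above amount to one predicate per referenced block: it must have been written, the write must have completed without error, its generation must match what the parent recorded (unless either side is unknown), and it must have been flushed out of the device's write cache before the superblock is allowed to point at it. A condensed restatement of that predicate, with invented names and a stand-in for BTRFSIC_GENERATION_UNKNOWN (whose value is not shown in this hunk):

#define TOY_GEN_UNKNOWN		((unsigned long long)-1)

struct toy_blk_info {
	int never_written, is_iodone, iodone_w_error;
	unsigned long long generation, flush_gen;
};

static int toy_ref_safe_for_superblock(const struct toy_blk_info *ref,
				       unsigned long long parent_generation,
				       unsigned long long dev_last_flush_gen)
{
	if (ref->never_written)			/* never hit the media */
		return 0;
	if (!ref->is_iodone)			/* write still in flight */
		return 0;
	if (ref->iodone_w_error)		/* write completed with error */
		return 0;
	if (parent_generation != TOY_GEN_UNKNOWN &&
	    ref->generation != TOY_GEN_UNKNOWN &&
	    ref->generation != parent_generation)	/* stale pointer */
		return 0;
	if (ref->flush_gen > dev_last_flush_gen)	/* still in write cache */
		return 0;
	return 1;
}
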
+ */ + list_for_each_entry(l, &block->ref_from_list, node_ref_from) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->bdev, + block->dev_bytenr, block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->bdev, + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->bdev, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->bdev, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + const struct btrfsic_block_link *l; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. 
+ */ + indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->bdev, + block->dev_bytenr, block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (!l) + return NULL; + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (!block) + return NULL; + + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); + if (NULL == dev_state) { + pr_info("btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + block->logical_bytenr, dev_state->bdev, + block->dev_bytenr, mirror_num); + list_add(&block->all_blocks_node, 
&state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr) +{ + struct btrfs_fs_info *fs_info = state->fs_info; + struct btrfsic_block_data_ctx block_ctx; + int num_copies; + int mirror_num; + int match = 0; + int ret; + + num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, state->metablock_size, + &block_ctx, mirror_num); + if (ret) { + pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n", + bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (WARN_ON(!match)) { + pr_info( +"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", + bytenr, dev_state->bdev, dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, + &block_ctx, mirror_num); + if (ret) + continue; + + pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", + bytenr, block_ctx.dev->bdev, + block_ctx.dev_bytenr, mirror_num); + } + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) +{ + return btrfsic_dev_state_hashtable_lookup(dev, + &btrfsic_dev_state_hashtable); +} + +static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) +{ + unsigned int segs = bio_segments(bio); + u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; + u64 cur_bytenr = dev_bytenr; + struct bvec_iter iter; + struct bio_vec bvec; + char **mapped_datav; + int bio_is_patched = 0; + int i = 0; + + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info( +"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + bio_op(bio), bio->bi_opf, segs, + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); + + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); + if (!mapped_datav) + return; + + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = page_address(bvec.bv_page); + i++; + + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; + } + + btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, + bio, &bio_is_patched, bio->bi_opf); + kfree(mapped_datav); +} + +static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) +{ + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); + + if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + 
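
In btrfsic_check_write_bio() above, the on-disk byte offset is derived directly from the bio: bi_sector counts 512-byte units, so dev_bytenr is 512 * bi_sector, and each full-page segment is then mapped with page_address(). A worked instance with invented numbers (a 16 KiB metadata write on 4 KiB pages):

#include <stdio.h>

int main(void)
{
	unsigned long long bi_sector = 2048;		 /* hypothetical bio */
	unsigned long long dev_bytenr = 512 * bi_sector; /* = 1048576, 1 MiB */
	unsigned int pages = 16384 / 4096;		 /* 4 page-sized segments */

	printf("write starts at byte %llu and spans %u pages\n",
	       dev_bytenr, pages);
	return 0;
}
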
block->submit_bio_bh_rw = bio->bi_opf; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } else if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) { + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); + } +} + +void btrfsic_check_bio(struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + /* + * We can be called before btrfsic_mount, so there might not be a + * dev_state. + */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + mutex_lock(&btrfsic_mutex); + if (dev_state) { + if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) + btrfsic_check_write_bio(bio, dev_state); + else if (bio->bi_opf & REQ_PREFLUSH) + btrfsic_check_flush_bio(bio, dev_state); + } + mutex_unlock(&btrfsic_mutex); +} + +int btrfsic_mount(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!PAGE_ALIGNED(fs_info->nodesize)) { + pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", + fs_info->nodesize, PAGE_SIZE); + return -1; + } + if (!PAGE_ALIGNED(fs_info->sectorsize)) { + pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", + fs_info->sectorsize, PAGE_SIZE); + return -1; + } + state = kvzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->fs_info = fs_info; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->metablock_size = fs_info->nodesize; + state->datablock_size = fs_info->sectorsize; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + mutex_unlock(&btrfsic_mutex); + return -ENOMEM; + } + ds->bdev = device->bdev; + ds->state = state; + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) +{ + struct btrfsic_block *b_all, *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if 
(!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev->bd_dev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + pr_info("btrfsic: error, cannot find state information on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, + all_blocks_node) { + struct btrfsic_block_link *l, *tmp; + + list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, + node_ref_to) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone || b_all->never_written) + btrfsic_block_free(b_all); + else + pr_info( +"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->bdev, + b_all->dev_bytenr, b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kvfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 0000000000..e4c8aed799 --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + */ + +#ifndef BTRFS_CHECK_INTEGRITY_H +#define BTRFS_CHECK_INTEGRITY_H + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +void btrfsic_check_bio(struct bio *bio); +#else +static inline void btrfsic_check_bio(struct bio *bio) { } +#endif + +int btrfsic_mount(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_fs_devices *fs_devices); + +#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 0000000000..8818ed5c39 --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,1464 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Oracle. All rights reserved. 
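
The header above follows the usual Kconfig stub pattern: with CONFIG_BTRFS_FS_CHECK_INTEGRITY=n, btrfsic_check_bio() is an empty static inline, so callers in the submit path can invoke it unconditionally and pay nothing when the checker is not built. A hypothetical call site, not part of this patch, to show the shape:

#include <linux/bio.h>
#include "check-integrity.h"

/* Illustrative only: the empty inline stub means no #ifdef is needed here. */
static void submit_one_bio_checked(struct bio *bio)
{
	btrfsic_check_bio(bio);		/* no-op unless the checker is built */
	submit_bio(bio);
}
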
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "fs.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "bio.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" +#include "subpage.h" +#include "zoned.h" +#include "file-item.h" +#include "super.h" + +static struct bio_set btrfs_compressed_bioset; + +static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; + +const char* btrfs_compress_type2str(enum btrfs_compression_type type) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: + case BTRFS_COMPRESS_LZO: + case BTRFS_COMPRESS_ZSTD: + case BTRFS_COMPRESS_NONE: + return btrfs_compress_types[type]; + default: + break; + } + + return NULL; +} + +static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) +{ + return container_of(bbio, struct compressed_bio, bbio); +} + +static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, + u64 start, blk_opf_t op, + btrfs_bio_end_io_t end_io) +{ + struct btrfs_bio *bbio; + + bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, + GFP_NOFS, &btrfs_compressed_bioset)); + btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); + bbio->inode = inode; + bbio->file_offset = start; + return to_compressed_bio(bbio); +} + +bool btrfs_compress_is_valid_type(const char *str, size_t len) +{ + int i; + + for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { + size_t comp_len = strlen(btrfs_compress_types[i]); + + if (len < comp_len) + continue; + + if (!strncmp(btrfs_compress_types[i], str, comp_len)) + return true; + } + return false; +} + +static int compression_compress_pages(int type, struct list_head *ws, + struct address_space *mapping, u64 start, struct page **pages, + unsigned long *out_pages, unsigned long *total_in, + unsigned long *total_out) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: + return zlib_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_LZO: + return lzo_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_ZSTD: + return zstd_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can happen when compression races with remount setting + * it to 'no compress', while caller doesn't call + * inode_need_compress() to check if we really need to + * compress. + * + * Not a big deal, just need to inform caller that we + * haven't allocated any pages yet. + */ + *out_pages = 0; + return -E2BIG; + } +} + +static int compression_decompress_bio(struct list_head *ws, + struct compressed_bio *cb) +{ + switch (cb->compress_type) { + case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); + case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); + case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. 
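
Note that btrfs_compress_is_valid_type() above compares only strlen(name) bytes and skips index 0 (the empty string standing for "none"), so a mount-option style value with a level suffix still matches. A small usage illustration, not part of the patch:

#include <linux/types.h>
#include "compression.h"

static void compress_option_examples(void)
{
	bool ok_level = btrfs_compress_is_valid_type("zlib:9", 6); /* true  */
	bool ok_plain = btrfs_compress_is_valid_type("zstd", 4);   /* true  */
	bool unknown  = btrfs_compress_is_valid_type("lzma", 4);   /* false */

	(void)ok_level; (void)ok_plain; (void)unknown;
}
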
+ */ + BUG(); + } +} + +static int compression_decompress(int type, struct list_head *ws, + const u8 *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +static void btrfs_free_compressed_pages(struct compressed_bio *cb) +{ + for (unsigned int i = 0; i < cb->nr_pages; i++) + put_page(cb->compressed_pages[i]); + kfree(cb->compressed_pages); +} + +static int btrfs_decompress_bio(struct compressed_bio *cb); + +static void end_compressed_bio_read(struct btrfs_bio *bbio) +{ + struct compressed_bio *cb = to_compressed_bio(bbio); + blk_status_t status = bbio->bio.bi_status; + + if (!status) + status = errno_to_blk_status(btrfs_decompress_bio(cb)); + + btrfs_free_compressed_pages(cb); + btrfs_bio_end_io(cb->orig_bbio, status); + bio_put(&bbio->bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline void end_compressed_writeback(const struct compressed_bio *cb) +{ + struct inode *inode = &cb->bbio.inode->vfs_inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long index = cb->start >> PAGE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; + struct folio_batch fbatch; + const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + int i; + int ret; + + if (errno) + mapping_set_error(inode->i_mapping, errno); + + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; + + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, + cb->start, cb->len); + } + folio_batch_release(&fbatch); + } + /* the inode may be gone now */ +} + +static void btrfs_finish_compressed_write_work(struct work_struct *work) +{ + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); + + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); + + if (cb->writeback) + end_compressed_writeback(cb); + /* Note, our inode could be gone now */ + + btrfs_free_compressed_pages(cb); + bio_put(&cb->bbio.bio); +} + +/* + * Do the cleanup once all the compressed pages hit the disk. This will clear + * writeback on the file pages and free the compressed pages. + * + * This also calls the writeback end hooks for the file pages so that metadata + * and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct btrfs_bio *bbio) +{ + struct compressed_bio *cb = to_compressed_bio(bbio); + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); +} + +static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + u32 offset = 0; + + while (offset < cb->compressed_len) { + u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); + + /* Maximum compressed extent is smaller than bio size limit. 
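
The comment above holds because the bio was allocated with BTRFS_MAX_COMPRESSED_PAGES bio_vecs in alloc_compressed_bio(), and a compressed extent is capped well below the bio size limit, so __bio_add_page() can never run out of room. The arithmetic, assuming 4 KiB pages and the usual 128 KiB cap on a compressed extent (an assumption, not shown in this hunk):

#define EX_MAX_COMPRESSED	(128 * 1024)
#define EX_PAGE_SIZE		4096
#define EX_MAX_PAGES		(EX_MAX_COMPRESSED / EX_PAGE_SIZE)	/* 32 */

/* The loop above issues one __bio_add_page() per PAGE_SIZE chunk, so at
 * most EX_MAX_PAGES calls, which always fit the preallocated bio_vecs. */
static unsigned int ex_nr_add_page_calls(unsigned int compressed_len)
{
	return (compressed_len + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE;
}
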
*/ + __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], + len, 0); + offset += len; + } +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, + bool writeback) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct compressed_bio *cb; + + ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); + + cb = alloc_compressed_bio(inode, ordered->file_offset, + REQ_OP_WRITE | write_flags, + end_compressed_bio_write); + cb->start = ordered->file_offset; + cb->len = ordered->num_bytes; + cb->compressed_pages = compressed_pages; + cb->compressed_len = ordered->disk_num_bytes; + cb->writeback = writeback; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); + cb->nr_pages = nr_pages; + cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; + cb->bbio.ordered = ordered; + btrfs_add_compressed_bio_pages(cb); + + btrfs_submit_bio(&cb->bbio, 0); +} + +/* + * Add extra pages in the same compressed file extent so that we don't need to + * re-read the same extent again and again. + * + * NOTE: this won't work well for subpage, as for subpage read, we lock the + * full page then submit bio for each compressed/regular extents. + * + * This means, if we have several sectors in the same page points to the same + * on-disk compressed data, we will re-read the same extent many times and + * this function can only help for the next page. + */ +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb, + int *memstall, unsigned long *pflags) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long end_index; + struct bio *orig_bio = &cb->orig_bbio->bio; + u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + int sectors_missed = 0; + + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + /* + * For current subpage support, we only support 64K page size, + * which means maximum compressed extent size (128K) is just 2x page + * size. + * This makes readahead less effective, so here disable readahead for + * subpage for now, until full compressed write is supported. 
+ */ + if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + while (cur < compressed_end) { + u64 page_end; + u64 pg_index = cur >> PAGE_SHIFT; + u32 add_size; + + if (pg_index > end_index) + break; + + page = xa_load(&mapping->i_pages, pg_index); + if (page && !xa_is_value(page)) { + sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + fs_info->sectorsize_bits; + + /* Beyond threshold, no need to continue */ + if (sectors_missed > 4) + break; + + /* + * Jump to next page start as we already have page for + * current offset. + */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; + } + + page = __page_cache_alloc(mapping_gfp_constraint(mapping, + ~__GFP_FS)); + if (!page) + break; + + if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { + put_page(page); + /* There is already a page, skip to page end */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; + } + + if (!*memstall && PageWorkingset(page)) { + psi_memstall_enter(pflags); + *memstall = 1; + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + + page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + lock_extent(tree, cur, page_end, NULL); + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); + read_unlock(&em_tree->lock); + + /* + * At this point, we have a locked page in the page cache for + * these bytes in the file. But, we have to make sure they map + * to this compressed extent on disk. + */ + if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || + (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { + free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); + unlock_page(page); + put_page(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + size_t zero_offset = offset_in_page(isize); + + if (zero_offset) { + int zeros; + zeros = PAGE_SIZE - zero_offset; + memzero_page(page, zero_offset, zeros); + } + } + + add_size = min(em->start + em->len, page_end + 1) - cur; + ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end, NULL); + unlock_page(page); + put_page(page); + break; + } + /* + * If it's subpage, we also need to increase its + * subpage::readers number, as at endio we will decrease + * subpage::readers and to unlock the page. + */ + if (fs_info->sectorsize < PAGE_SIZE) + btrfs_subpage_start_reader(fs_info, page, cur, add_size); + put_page(page); + cur += add_size; + } + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. 
+ * + * bio->bi_iter.bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +void btrfs_submit_compressed_read(struct btrfs_bio *bbio) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree = &inode->extent_tree; + struct compressed_bio *cb; + unsigned int compressed_len; + u64 file_offset = bbio->file_offset; + u64 em_len; + u64 em_start; + struct extent_map *em; + unsigned long pflags; + int memstall = 0; + blk_status_t ret; + int ret2; + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); + read_unlock(&em_tree->lock); + if (!em) { + ret = BLK_STS_IOERR; + goto out; + } + + ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); + compressed_len = em->block_len; + + cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, + end_compressed_bio_read); + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + cb->len = bbio->bio.bi_iter.bi_size; + cb->compressed_len = compressed_len; + cb->compress_type = em->compress_type; + cb->orig_bbio = bbio; + + free_extent_map(em); + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; + goto out_free_bio; + } + + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; + goto out_free_compressed_pages; + } + + add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, + &pflags); + + /* include any pages we added in add_ra-bio_pages */ + cb->len = bbio->bio.bi_iter.bi_size; + cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; + btrfs_add_compressed_bio_pages(cb); + + if (memstall) + psi_memstall_leave(&pflags); + + btrfs_submit_bio(&cb->bbio, 0); + return; + +out_free_compressed_pages: + kfree(cb->compressed_pages); +out_free_bio: + bio_put(&cb->bbio.bio); +out: + btrfs_bio_end_io(bbio, ret); +} + +/* + * Heuristic uses systematic sampling to collect data from the input data + * range, the logic can be tuned by the following constants: + * + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample + * @SAMPLING_INTERVAL - range from which the sampled data can be collected + */ +#define SAMPLING_READ_SIZE (16) +#define SAMPLING_INTERVAL (256) + +/* + * For statistical analysis of the input data we consider bytes that form a + * Galois Field of 256 objects. Each object has an attribute count, ie. how + * many times the object appeared in the sample. + */ +#define BUCKET_SIZE (256) + +/* + * The size of the sample is based on a statistical sampling rule of thumb. + * The common way is to perform sampling tests as long as the number of + * elements in each cell is at least 5. + * + * Instead of 5, we choose 32 to obtain more accurate results. + * If the data contain the maximum number of symbols, which is 256, we obtain a + * sample size bound by 8192. + * + * For a sample of at most 8KB of data per data range: 16 consecutive bytes + * from up to 512 locations. 
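
The numbers in the sampling comment above work out as follows, assuming BTRFS_MAX_UNCOMPRESSED is the usual 128 KiB extent cap (an assumption; the macro itself is defined elsewhere):

#define EX_MAX_UNCOMPRESSED	(128 * 1024)	/* 131072 bytes */
#define EX_SAMPLING_READ_SIZE	16
#define EX_SAMPLING_INTERVAL	256

/* 131072 * 16 / 256 = 8192 bytes sampled per data range ... */
#define EX_MAX_SAMPLE_SIZE \
	(EX_MAX_UNCOMPRESSED * EX_SAMPLING_READ_SIZE / EX_SAMPLING_INTERVAL)
/* ... collected as 8192 / 16 = 512 chunks of 16 consecutive bytes. */
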
+ */ +#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) + +struct bucket_item { + u32 count; +}; + +struct heuristic_ws { + /* Partial copy of input data */ + u8 *sample; + u32 sample_size; + /* Buckets store counters for each byte value */ + struct bucket_item *bucket; + /* Sorting buffer */ + struct bucket_item *bucket_b; + struct list_head list; +}; + +static struct workspace_manager heuristic_wsm; + +static void free_heuristic_ws(struct list_head *ws) +{ + struct heuristic_ws *workspace; + + workspace = list_entry(ws, struct heuristic_ws, list); + + kvfree(workspace->sample); + kfree(workspace->bucket); + kfree(workspace->bucket_b); + kfree(workspace); +} + +static struct list_head *alloc_heuristic_ws(unsigned int level) +{ + struct heuristic_ws *ws; + + ws = kzalloc(sizeof(*ws), GFP_KERNEL); + if (!ws) + return ERR_PTR(-ENOMEM); + + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); + if (!ws->sample) + goto fail; + + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); + if (!ws->bucket) + goto fail; + + ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL); + if (!ws->bucket_b) + goto fail; + + INIT_LIST_HEAD(&ws->list); + return &ws->list; +fail: + free_heuristic_ws(&ws->list); + return ERR_PTR(-ENOMEM); +} + +const struct btrfs_compress_op btrfs_heuristic_compress = { + .workspace_manager = &heuristic_wsm, +}; + +static const struct btrfs_compress_op * const btrfs_compress_op[] = { + /* The heuristic is represented as compression type 0 */ + &btrfs_heuristic_compress, + &btrfs_zlib_compress, + &btrfs_lzo_compress, + &btrfs_zstd_compress, +}; + +static struct list_head *alloc_workspace(int type, unsigned int level) +{ + switch (type) { + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +static void free_workspace(int type, struct list_head *ws) +{ + switch (type) { + case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws); + case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws); + case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws); + case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws); + default: + /* + * This can't happen, the type is validated several times + * before we get here. 
+ */ + BUG(); + } +} + +static void btrfs_init_workspace_manager(int type) +{ + struct workspace_manager *wsm; + struct list_head *workspace; + + wsm = btrfs_compress_op[type]->workspace_manager; + INIT_LIST_HEAD(&wsm->idle_ws); + spin_lock_init(&wsm->ws_lock); + atomic_set(&wsm->total_ws, 0); + init_waitqueue_head(&wsm->ws_wait); + + /* + * Preallocate one workspace for each compression type so we can + * guarantee forward progress in the worst case + */ + workspace = alloc_workspace(type, 0); + if (IS_ERR(workspace)) { + pr_warn( + "BTRFS: cannot preallocate compression workspace, will try later\n"); + } else { + atomic_set(&wsm->total_ws, 1); + wsm->free_ws = 1; + list_add(workspace, &wsm->idle_ws); + } +} + +static void btrfs_cleanup_workspace_manager(int type) +{ + struct workspace_manager *wsman; + struct list_head *ws; + + wsman = btrfs_compress_op[type]->workspace_manager; + while (!list_empty(&wsman->idle_ws)) { + ws = wsman->idle_ws.next; + list_del(ws); + free_workspace(type, ws); + atomic_dec(&wsman->total_ws); + } +} + +/* + * This finds an available workspace or allocates a new one. + * If it's not possible to allocate a new one, waits until there's one. + * Preallocation makes a forward progress guarantees and we do not return + * errors. + */ +struct list_head *btrfs_get_workspace(int type, unsigned int level) +{ + struct workspace_manager *wsm; + struct list_head *workspace; + int cpus = num_online_cpus(); + unsigned nofs_flag; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + wsm = btrfs_compress_op[type]->workspace_manager; + idle_ws = &wsm->idle_ws; + ws_lock = &wsm->ws_lock; + total_ws = &wsm->total_ws; + ws_wait = &wsm->ws_wait; + free_ws = &wsm->free_ws; + +again: + spin_lock(ws_lock); + if (!list_empty(idle_ws)) { + workspace = idle_ws->next; + list_del(workspace); + (*free_ws)--; + spin_unlock(ws_lock); + return workspace; + + } + if (atomic_read(total_ws) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(ws_lock); + prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(total_ws) > cpus && !*free_ws) + schedule(); + finish_wait(ws_wait, &wait); + goto again; + } + atomic_inc(total_ws); + spin_unlock(ws_lock); + + /* + * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have + * to turn it off here because we might get called from the restricted + * context of btrfs_compress_bio/btrfs_compress_pages + */ + nofs_flag = memalloc_nofs_save(); + workspace = alloc_workspace(type, level); + memalloc_nofs_restore(nofs_flag); + + if (IS_ERR(workspace)) { + atomic_dec(total_ws); + wake_up(ws_wait); + + /* + * Do not return the error but go back to waiting. There's a + * workspace preallocated for each type and the compression + * time is bounded so we get to a workspace eventually. This + * makes our caller's life easier. + * + * To prevent silent and low-probability deadlocks (when the + * initial preallocation fails), check if there are any + * workspaces at all. 
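
btrfs_get_workspace() above encodes a simple policy: reuse an idle workspace if one exists, grow the pool only up to num_online_cpus(), and otherwise sleep until a workspace is returned; allocation failures also fall back to waiting, which is safe because one workspace per type was preallocated at init time. A self-contained user-space model of that policy (names invented; it assumes the limit fits the small fixed idle array, and unlike the real put path it never frees surplus workspaces):

#include <pthread.h>
#include <stdlib.h>

struct toy_pool {
	pthread_mutex_t lock;
	pthread_cond_t  returned;
	void *idle[64];			/* assumes limit <= 64 */
	int nr_idle, total, limit;	/* limit plays the num_online_cpus() role */
};

static void *toy_get(struct toy_pool *p, size_t ws_size)
{
	void *ws;

	pthread_mutex_lock(&p->lock);
	for (;;) {
		if (p->nr_idle) {			/* fast path: reuse */
			ws = p->idle[--p->nr_idle];
			break;
		}
		if (p->total < p->limit) {		/* allowed to grow */
			ws = malloc(ws_size);
			if (ws) {
				p->total++;
				break;
			}
		}
		/* Pool is full or allocation failed: never error out, wait.
		 * The workspace preallocated at init guarantees progress. */
		pthread_cond_wait(&p->returned, &p->lock);
	}
	pthread_mutex_unlock(&p->lock);
	return ws;
}

static void toy_put(struct toy_pool *p, void *ws)
{
	pthread_mutex_lock(&p->lock);
	p->idle[p->nr_idle++] = ws;
	pthread_mutex_unlock(&p->lock);
	pthread_cond_signal(&p->returned);
}
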
+ */ + if (atomic_read(total_ws) == 0) { + static DEFINE_RATELIMIT_STATE(_rs, + /* once per minute */ 60 * HZ, + /* no burst */ 1); + + if (__ratelimit(&_rs)) { + pr_warn("BTRFS: no compression workspaces, low memory, retrying\n"); + } + } + goto again; + } + return workspace; +} + +static struct list_head *get_workspace(int type, int level) +{ + switch (type) { + case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); + case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); + case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); + case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +void btrfs_put_workspace(int type, struct list_head *ws) +{ + struct workspace_manager *wsm; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + wsm = btrfs_compress_op[type]->workspace_manager; + idle_ws = &wsm->idle_ws; + ws_lock = &wsm->ws_lock; + total_ws = &wsm->total_ws; + ws_wait = &wsm->ws_wait; + free_ws = &wsm->free_ws; + + spin_lock(ws_lock); + if (*free_ws <= num_online_cpus()) { + list_add(ws, idle_ws); + (*free_ws)++; + spin_unlock(ws_lock); + goto wake; + } + spin_unlock(ws_lock); + + free_workspace(type, ws); + atomic_dec(total_ws); +wake: + cond_wake_up(ws_wait); +} + +static void put_workspace(int type, struct list_head *ws) +{ + switch (type) { + case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +/* + * Adjust @level according to the limits of the compression algorithm or + * fallback to default + */ +static unsigned int btrfs_compress_set_level(int type, unsigned level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + if (level == 0) + level = ops->default_level; + else + level = min(level, ops->max_level); + + return level; +} + +/* + * Given an address space and start and length, compress the bytes into @pages + * that are allocated on demand. + * + * @type_level is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 + * + * @out_pages is an in/out parameter, holds maximum number of pages to allocate + * and returns number of actually allocated pages + * + * @total_in is used to return the number of bytes actually read. It + * may be smaller than the input length if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
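
The @type_level packing described in the comment below keeps the algorithm in the low four bits and the level in the next four; btrfs_compress_type() and btrfs_compress_level(), used further down, are assumed to unpack it exactly that way. A worked example, using type 3 for zstd as implied by the btrfs_compress_types[] table earlier in this file:

static void type_level_example(void)
{
	unsigned int type_level = (7U << 4) | 3U;	/* zstd, level 7 = 0x73 */
	unsigned int type  = type_level & 0xF;		/* 3, i.e. zstd */
	unsigned int level = (type_level >> 4) & 0xF;	/* 7 */

	(void)type;
	(void)level;
}
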
+ * + * @total_out is an in/out parameter, must be set to the input length and will + * be also used to return the total number of compressed bytes + */ +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, + u64 start, struct page **pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out) +{ + int type = btrfs_compress_type(type_level); + int level = btrfs_compress_level(type_level); + struct list_head *workspace; + int ret; + + level = btrfs_compress_set_level(type, level); + workspace = get_workspace(type, level); + ret = compression_compress_pages(type, workspace, mapping, start, pages, + out_pages, total_in, total_out); + put_workspace(type, workspace); + return ret; +} + +static int btrfs_decompress_bio(struct compressed_bio *cb) +{ + struct list_head *workspace; + int ret; + int type = cb->compress_type; + + workspace = get_workspace(type, 0); + ret = compression_decompress_bio(workspace, cb); + put_workspace(type, workspace); + + if (!ret) + zero_fill_bio(&cb->orig_bbio->bio); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = get_workspace(type, 0); + ret = compression_decompress(type, workspace, data_in, dest_page, + start_byte, srclen, destlen); + put_workspace(type, workspace); + + return ret; +} + +int __init btrfs_init_compress(void) +{ + if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, + offsetof(struct compressed_bio, bbio.bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_init_workspace_manager(); + return 0; +} + +void __cold btrfs_exit_compress(void) +{ + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_cleanup_workspace_manager(); + bioset_exit(&btrfs_compressed_bioset); +} + +/* + * Copy decompressed data from working buffer to pages. + * + * @buf: The decompressed data buffer + * @buf_len: The decompressed data length + * @decompressed: Number of bytes that are already decompressed inside the + * compressed extent + * @cb: The compressed extent descriptor + * @orig_bio: The original bio that the caller wants to read for + * + * An easier to understand graph is like below: + * + * |<- orig_bio ->| |<- orig_bio->| + * |<------- full decompressed extent ----->| + * |<----------- @cb range ---->| + * | |<-- @buf_len -->| + * |<--- @decompressed --->| + * + * Note that, @cb can be a subpage of the full decompressed extent, but + * @cb->start always has the same as the orig_file_offset value of the full + * decompressed extent. + * + * When reading compressed extent, we have to read the full compressed extent, + * while @orig_bio may only want part of the range. + * Thus this function will ensure only data covered by @orig_bio will be copied + * to. + * + * Return 0 if we have copied all needed contents for @orig_bio. + * Return >0 if we need continue decompress. 
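+ *
+ * A small worked example with illustrative numbers: for a 16K
+ * decompressed extent, a call with @decompressed == 4K and
+ * @buf_len == 4K only fills bvecs of @orig_bio whose file offset falls
+ * inside [@cb->start + 4K, @cb->start + 8K); earlier ranges were
+ * handled by previous calls and later ranges need further calls once
+ * more data has been decompressed.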
+ */ +int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed) +{ + struct bio *orig_bio = &cb->orig_bbio->bio; + /* Offset inside the full decompressed extent */ + u32 cur_offset; + + cur_offset = decompressed; + /* The main loop to do the copy */ + while (cur_offset < decompressed + buf_len) { + struct bio_vec bvec; + size_t copy_len; + u32 copy_start; + /* Offset inside the full decompressed extent */ + u32 bvec_offset; + + bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); + /* + * cb->start may underflow, but subtracting that value can still + * give us correct offset inside the full decompressed extent. + */ + bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; + + /* Haven't reached the bvec range, exit */ + if (decompressed + buf_len <= bvec_offset) + return 1; + + copy_start = max(cur_offset, bvec_offset); + copy_len = min(bvec_offset + bvec.bv_len, + decompressed + buf_len) - copy_start; + ASSERT(copy_len); + + /* + * Extra range check to ensure we didn't go beyond + * @buf + @buf_len. + */ + ASSERT(copy_start - decompressed < buf_len); + memcpy_to_page(bvec.bv_page, bvec.bv_offset, + buf + copy_start - decompressed, copy_len); + cur_offset += copy_len; + + bio_advance(orig_bio, copy_len); + /* Finished the bio */ + if (!orig_bio->bi_iter.bi_size) + return 0; + } + return 1; +} + +/* + * Shannon Entropy calculation + * + * Pure byte distribution analysis fails to determine compressibility of data. + * Try calculating entropy to estimate the average minimum number of bits + * needed to encode the sampled data. + * + * For convenience, return the percentage of needed bits, instead of amount of + * bits directly. + * + * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy + * and can be compressible with high probability + * + * @ENTROPY_LVL_HIGH - data are not compressible with high probability + * + * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. + */ +#define ENTROPY_LVL_ACEPTABLE (65) +#define ENTROPY_LVL_HIGH (80) + +/* + * For increasead precision in shannon_entropy calculation, + * let's do pow(n, M) to save more digits after comma: + * + * - maximum int bit length is 64 + * - ilog2(MAX_SAMPLE_SIZE) -> 13 + * - 13 * 4 = 52 < 64 -> M = 4 + * + * So use pow(n, 4). 
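+ *
+ * Put differently, ilog2_w(n) == ilog2(n^4) ~= 4 * log2(n), so one unit
+ * of the result is a quarter of a bit. For example a bucket with count
+ * 256 in an 8192 byte sample contributes sz_base - p_base =
+ * ilog2_w(8192) - ilog2_w(256) = 52 - 32 = 20 in shannon_entropy()
+ * below, i.e. about 5 bits per byte for that bucket, and the maximum
+ * entropy used for scaling is 8 * ilog2_w(2) = 32, i.e. 8 bits per byte.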
+ */ +static inline u32 ilog2_w(u64 n) +{ + return ilog2(n * n * n * n); +} + +static u32 shannon_entropy(struct heuristic_ws *ws) +{ + const u32 entropy_max = 8 * ilog2_w(2); + u32 entropy_sum = 0; + u32 p, p_base, sz_base; + u32 i; + + sz_base = ilog2_w(ws->sample_size); + for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { + p = ws->bucket[i].count; + p_base = ilog2_w(p); + entropy_sum += p * (sz_base - p_base); + } + + entropy_sum /= ws->sample_size; + return entropy_sum * 100 / entropy_max; +} + +#define RADIX_BASE 4U +#define COUNTERS_SIZE (1U << RADIX_BASE) + +static u8 get4bits(u64 num, int shift) { + u8 low4bits; + + num >>= shift; + /* Reverse order */ + low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE); + return low4bits; +} + +/* + * Use 4 bits as radix base + * Use 16 u32 counters for calculating new position in buf array + * + * @array - array that will be sorted + * @array_buf - buffer array to store sorting results + * must be equal in size to @array + * @num - array size + */ +static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf, + int num) +{ + u64 max_num; + u64 buf_num; + u32 counters[COUNTERS_SIZE]; + u32 new_addr; + u32 addr; + int bitlen; + int shift; + int i; + + /* + * Try avoid useless loop iterations for small numbers stored in big + * counters. Example: 48 33 4 ... in 64bit array + */ + max_num = array[0].count; + for (i = 1; i < num; i++) { + buf_num = array[i].count; + if (buf_num > max_num) + max_num = buf_num; + } + + buf_num = ilog2(max_num); + bitlen = ALIGN(buf_num, RADIX_BASE * 2); + + shift = 0; + while (shift < bitlen) { + memset(counters, 0, sizeof(counters)); + + for (i = 0; i < num; i++) { + buf_num = array[i].count; + addr = get4bits(buf_num, shift); + counters[addr]++; + } + + for (i = 1; i < COUNTERS_SIZE; i++) + counters[i] += counters[i - 1]; + + for (i = num - 1; i >= 0; i--) { + buf_num = array[i].count; + addr = get4bits(buf_num, shift); + counters[addr]--; + new_addr = counters[addr]; + array_buf[new_addr] = array[i]; + } + + shift += RADIX_BASE; + + /* + * Normal radix expects to move data from a temporary array, to + * the main one. But that requires some CPU time. Avoid that + * by doing another sort iteration to original array instead of + * memcpy() + */ + memset(counters, 0, sizeof(counters)); + + for (i = 0; i < num; i ++) { + buf_num = array_buf[i].count; + addr = get4bits(buf_num, shift); + counters[addr]++; + } + + for (i = 1; i < COUNTERS_SIZE; i++) + counters[i] += counters[i - 1]; + + for (i = num - 1; i >= 0; i--) { + buf_num = array_buf[i].count; + addr = get4bits(buf_num, shift); + counters[addr]--; + new_addr = counters[addr]; + array[new_addr] = array_buf[i]; + } + + shift += RADIX_BASE; + } +} + +/* + * Size of the core byte set - how many bytes cover 90% of the sample + * + * There are several types of structured binary data that use nearly all byte + * values. The distribution can be uniform and counts in all buckets will be + * nearly the same (eg. encrypted data). Unlikely to be compressible. + * + * Other possibility is normal (Gaussian) distribution, where the data could + * be potentially compressible, but we have to take a few more steps to decide + * how much. 
+ * + * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, + * compression algo can easy fix that + * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high + * probability is not compressible + */ +#define BYTE_CORE_SET_LOW (64) +#define BYTE_CORE_SET_HIGH (200) + +static int byte_core_set_size(struct heuristic_ws *ws) +{ + u32 i; + u32 coreset_sum = 0; + const u32 core_set_threshold = ws->sample_size * 90 / 100; + struct bucket_item *bucket = ws->bucket; + + /* Sort in reverse order */ + radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE); + + for (i = 0; i < BYTE_CORE_SET_LOW; i++) + coreset_sum += bucket[i].count; + + if (coreset_sum > core_set_threshold) + return i; + + for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { + coreset_sum += bucket[i].count; + if (coreset_sum > core_set_threshold) + break; + } + + return i; +} + +/* + * Count byte values in buckets. + * This heuristic can detect textual data (configs, xml, json, html, etc). + * Because in most text-like data byte set is restricted to limited number of + * possible characters, and that restriction in most cases makes data easy to + * compress. + * + * @BYTE_SET_THRESHOLD - consider all data within this byte set size: + * less - compressible + * more - need additional analysis + */ +#define BYTE_SET_THRESHOLD (64) + +static u32 byte_set_size(const struct heuristic_ws *ws) +{ + u32 i; + u32 byte_set_size = 0; + + for (i = 0; i < BYTE_SET_THRESHOLD; i++) { + if (ws->bucket[i].count > 0) + byte_set_size++; + } + + /* + * Continue collecting count of byte values in buckets. If the byte + * set size is bigger then the threshold, it's pointless to continue, + * the detection technique would fail for this type of data. + */ + for (; i < BUCKET_SIZE; i++) { + if (ws->bucket[i].count > 0) { + byte_set_size++; + if (byte_set_size > BYTE_SET_THRESHOLD) + return byte_set_size; + } + } + + return byte_set_size; +} + +static bool sample_repeated_patterns(struct heuristic_ws *ws) +{ + const u32 half_of_sample = ws->sample_size / 2; + const u8 *data = ws->sample; + + return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; +} + +static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + struct heuristic_ws *ws) +{ + struct page *page; + u64 index, index_end; + u32 i, curr_sample_pos; + u8 *in_data; + + /* + * Compression handles the input data by chunks of 128KiB + * (defined by BTRFS_MAX_UNCOMPRESSED) + * + * We do the same for the heuristic and loop over the whole range. + * + * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will + * process no more than BTRFS_MAX_UNCOMPRESSED at a time. 
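+ *
+ * The loop below copies SAMPLING_READ_SIZE bytes out of every
+ * SAMPLING_INTERVAL bytes of the range into ws->sample, so a full
+ * 128KiB chunk contributes at most MAX_SAMPLE_SIZE bytes and the later
+ * bucket/entropy analysis always works on a small, bounded sample
+ * rather than on the whole extent.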
+ */ + if (end - start > BTRFS_MAX_UNCOMPRESSED) + end = start + BTRFS_MAX_UNCOMPRESSED; + + index = start >> PAGE_SHIFT; + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ + if (!PAGE_ALIGNED(end)) + index_end++; + + curr_sample_pos = 0; + while (index < index_end) { + page = find_get_page(inode->i_mapping, index); + in_data = kmap_local_page(page); + /* Handle case where the start is not aligned to PAGE_SIZE */ + i = start % PAGE_SIZE; + while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { + /* Don't sample any garbage from the last page */ + if (start > end - SAMPLING_READ_SIZE) + break; + memcpy(&ws->sample[curr_sample_pos], &in_data[i], + SAMPLING_READ_SIZE); + i += SAMPLING_INTERVAL; + start += SAMPLING_INTERVAL; + curr_sample_pos += SAMPLING_READ_SIZE; + } + kunmap_local(in_data); + put_page(page); + + index++; + } + + ws->sample_size = curr_sample_pos; +} + +/* + * Compression heuristic. + * + * For now is's a naive and optimistic 'return true', we'll extend the logic to + * quickly (compared to direct compression) detect data characteristics + * (compressible/incompressible) to avoid wasting CPU time on incompressible + * data. + * + * The following types of analysis can be performed: + * - detect mostly zero data + * - detect data with low "byte set" size (text, etc) + * - detect data with low/high "core byte" set + * + * Return non-zero if the compression should be done, 0 otherwise. + */ +int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) +{ + struct list_head *ws_list = get_workspace(0, 0); + struct heuristic_ws *ws; + u32 i; + u8 byte; + int ret = 0; + + ws = list_entry(ws_list, struct heuristic_ws, list); + + heuristic_collect_sample(inode, start, end, ws); + + if (sample_repeated_patterns(ws)) { + ret = 1; + goto out; + } + + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); + + for (i = 0; i < ws->sample_size; i++) { + byte = ws->sample[i]; + ws->bucket[byte].count++; + } + + i = byte_set_size(ws); + if (i < BYTE_SET_THRESHOLD) { + ret = 2; + goto out; + } + + i = byte_core_set_size(ws); + if (i <= BYTE_CORE_SET_LOW) { + ret = 3; + goto out; + } + + if (i >= BYTE_CORE_SET_HIGH) { + ret = 0; + goto out; + } + + i = shannon_entropy(ws); + if (i <= ENTROPY_LVL_ACEPTABLE) { + ret = 4; + goto out; + } + + /* + * For the levels below ENTROPY_LVL_HIGH, additional analysis would be + * needed to give green light to compression. + * + * For now just assume that compression at that level is not worth the + * resources because: + * + * 1. it is possible to defrag the data later + * + * 2. the data would turn out to be hardly compressible, eg. 150 byte + * values, every bucket has counter at level ~54. The heuristic would + * be confused. This can happen when data have some internal repeated + * patterns like "abbacbbc...". This can be detected by analyzing + * pairs of bytes, which is too costly. + */ + if (i < ENTROPY_LVL_HIGH) { + ret = 5; + goto out; + } else { + ret = 0; + goto out; + } + +out: + put_workspace(0, ws_list); + return ret; +} + +/* + * Convert the compression suffix (eg. 
after "zlib" starting with ":") to + * level, unrecognized string will set the default level + */ +unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +{ + unsigned int level = 0; + int ret; + + if (!type) + return 0; + + if (str[0] == ':') { + ret = kstrtouint(str + 1, 10, &level); + if (ret) + level = 0; + } + + level = btrfs_compress_set_level(type, level); + + return level; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 0000000000..03bb9d143f --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + */ + +#ifndef BTRFS_COMPRESSION_H +#define BTRFS_COMPRESSION_H + +#include +#include "bio.h" + +struct btrfs_inode; +struct btrfs_ordered_extent; + +/* + * We want to make sure that amount of RAM required to uncompress an extent is + * reasonable, so we limit the total size in ram of a compressed extent to + * 128k. This is a crucial number because it also controls how easily we can + * spread reads across cpus for decompression. + * + * We also want to make sure the amount of IO required to do a random read is + * reasonably small, so we limit the size of a compressed extent to 128k. + */ + +/* Maximum length of compressed data stored on disk */ +#define BTRFS_MAX_COMPRESSED (SZ_128K) +#define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + +/* Maximum size of data before compression */ +#define BTRFS_MAX_UNCOMPRESSED (SZ_128K) + +#define BTRFS_ZLIB_DEFAULT_LEVEL 3 + +struct compressed_bio { + /* Number of compressed pages in the array */ + unsigned int nr_pages; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* starting offset in the inode for our pages */ + u64 start; + + /* Number of bytes in the inode we're working on */ + unsigned int len; + + /* Number of bytes on disk */ + unsigned int compressed_len; + + /* The compression algorithm for this bio */ + u8 compress_type; + + /* Whether this is a write for writeback. */ + bool writeback; + + union { + /* For reads, this is the bio we are copying the data into */ + struct btrfs_bio *orig_bbio; + struct work_struct write_end_work; + }; + + /* Must be last. 
*/ + struct btrfs_bio bbio; +}; + +static inline unsigned int btrfs_compress_type(unsigned int type_level) +{ + return (type_level & 0xF); +} + +static inline unsigned int btrfs_compress_level(unsigned int type_level) +{ + return ((type_level & 0xF0) >> 4); +} + +int __init btrfs_init_compress(void); +void __cold btrfs_exit_compress(void); + +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, + u64 start, struct page **pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out); +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed); + +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, + bool writeback); +void btrfs_submit_compressed_read(struct btrfs_bio *bbio); + +unsigned int btrfs_compress_str2level(unsigned int type, const char *str); + +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_NR_COMPRESS_TYPES = 4, +}; + +struct workspace_manager { + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; +}; + +struct list_head *btrfs_get_workspace(int type, unsigned int level); +void btrfs_put_workspace(int type, struct list_head *ws); + +struct btrfs_compress_op { + struct workspace_manager *workspace_manager; + /* Maximum level supported by the compression algorithm */ + unsigned int max_level; + unsigned int default_level; +}; + +/* The heuristic workspaces are managed via the 0th workspace manager */ +#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES + +extern const struct btrfs_compress_op btrfs_heuristic_compress; +extern const struct btrfs_compress_op btrfs_zlib_compress; +extern const struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zstd_compress; + +const char* btrfs_compress_type2str(enum btrfs_compression_type type); +bool btrfs_compress_is_valid_type(const char *str, size_t len); + +int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); + +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, const u8 *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, const u8 *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void 
lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, const u8 *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 0000000000..118ad4d2cb --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,5224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" +#include "volumes.h" +#include "qgroup.h" +#include "tree-mod-log.h" +#include "tree-checker.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "file-item.h" + +static struct kmem_cache *btrfs_path_cachep; + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *ins_key, struct btrfs_path *path, + int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); + +static const struct btrfs_csums { + u16 size; + const char name[10]; + const char driver[12]; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset were' moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. 
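+ *
+ * As a reminder of the leaf layout (simplified):
+ *
+ *   [header][item 0][item 1] ... [free space] ... [data 1][data 0]
+ *
+ * item data offsets stored in the items are relative to the first byte
+ * after the header, which is exactly btrfs_item_nr_offset(leaf, 0), so
+ * the helpers below add that value to turn them into offsets inside the
+ * extent buffer.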
+ */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); +} + +/* + * Copy item data from @src into @dst at the given @offset. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset were' copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); +} + +/* + * Move items in a @leaf (using memmove). + * + * @dst: destination leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* + * Copy items from @src into @dst at the given @offset. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ + return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver[0] ? 
+ btrfs_csums[csum_type].driver : + btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + might_sleep(); + + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + if (!p) + return; + btrfs_release_path(p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +bool __cold abort_should_print_stack(int errno) +{ + switch (errno) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was COWed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } + return eb; +} + +/* + * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), + * just get put onto a simple dirty list. Transaction walks this list to make + * sure they get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + + spin_lock(&fs_info->trans_lock); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &fs_info->dirty_cowonly_roots); + } + spin_unlock(&fs_info->trans_lock); +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. 
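+ *
+ * Note that only the root node itself is copied: the copy keeps
+ * pointing at the same lower level blocks and btrfs_inc_ref() below
+ * bumps the reference counts on everything the node points to (using
+ * full backrefs when the copy is made for the relocation tree), which
+ * is what lets the source tree and the new tree share those blocks.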
+ */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *cow; + int ret = 0; + int level; + struct btrfs_disk_key disk_key; + + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer_full(cow, buf); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); + + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + + btrfs_mark_buffer_dirty(trans, cow); + *cow_ret = cow; + return 0; +} + +/* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in shareable trees and tree roots are never shared. + * If a block was allocated after the last snapshot and the block was + * not allocated by tree relocation, we know the block is not shared. + */ + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + buf != root->node && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { + if (buf != root->commit_root) + return 1; + /* + * An extent buffer that used to be the commit root may still be + * shared because the tree height may have increased and it + * became a child of a higher level root. This can happen when + * snapshotting a subvolume created in the current transaction. + */ + if (btrfs_header_generation(buf) == trans->transid) + return 1; + } + + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow, + int *last_ref) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. 
+ * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(trans, root, buf)) { + ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, + btrfs_header_level(buf), 1, + &refs, &flags); + if (ret) + return ret; + if (unlikely(refs == 0)) { + btrfs_crit(fs_info, + "found 0 references for tree block at bytenr %llu level %d root %llu", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + if (ret) + return ret; + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + if (ret) + return ret; + ret = btrfs_inc_ref(trans, root, cow, 1); + if (ret) + return ret; + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) + return ret; + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); + if (ret) + return ret; + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) + return ret; + ret = btrfs_dec_ref(trans, root, buf, 1); + if (ret) + return ret; + } + btrfs_clear_buffer_dirty(trans, buf); + *last_ref = 1; + } + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. 
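+ *
+ * For example, btrfs_cow_block() always passes empty_size == 0, while
+ * btrfs_realloc_node() (the defrag path further below) passes up to 16
+ * node sizes so that the children it is about to COW next can be placed
+ * close to each other on disk.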
+ */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_disk_key disk_key; + struct extent_buffer *cow; + int level, ret; + int last_ref = 0; + int unlock_orig = 0; + u64 parent_start = 0; + + if (*cow_ret == buf) + unlock_orig = 1; + + btrfs_assert_tree_write_locked(buf); + + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) + parent_start = parent->start; + + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size, nest); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + /* cow is set to blocking by btrfs_init_new_buffer */ + + copy_extent_buffer_full(cow, buf); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); + + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); + + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + + ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); + if (ret < 0) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + atomic_inc(&cow->refs); + rcu_assign_pointer(root->node, cow); + + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + WARN_ON(trans->transid != btrfs_header_generation(parent)); + ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(trans, parent); + if (last_ref) { + ret = btrfs_tree_mod_log_free_eb(buf); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + } + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + 
parent_start, last_ref); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer_stale(buf); + btrfs_mark_buffer_dirty(trans, cow); + *cow_ret = cow; + return 0; +} + +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_is_testing(root->fs_info)) + return 0; + + /* Ensure we can see the FORCE_COW bit */ + smp_mb__before_atomic(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during committing the transaction, + * after we've finished copying src root, we must COW the shared + * block to ensure the metadata consistency. + */ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) + return 0; + return 1; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't COWed more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 search_start; + int ret; + + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } + + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } + + if (!should_cow_block(trans, root, buf)) { + *cow_ret = buf; + return 0; + } + + search_start = buf->start & ~((u64)SZ_1G - 1); + + /* + * Before CoWing this block for later modification, check if it's + * the subtree root and do the delayed subtree trace if needed. + * + * Also We don't care about the error, as it's handled internally. 
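+ *
+ * (The search_start hint computed above just rounds buf->start down to
+ * the containing 1GiB boundary, e.g. a block at 5GiB + 300MiB gives a
+ * hint of 5GiB, so the COW copy tends to stay in the same area of the
+ * disk.)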
+ */ + btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0, nest); + + trace_btrfs_cow_block(root, buf, *cow_ret); + + return ret; +} +ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys, on little-endian the disk order is same as CPU order and + * we can avoid the conversion. + */ +static int comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(const struct btrfs_disk_key *disk, + const struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} +#endif + +/* + * same as comp_keys only with two btrfs_key's + */ +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *cur; + u64 blocknr; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. 
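+ *
+ * Apart from that sanity check, the interesting part is the loop below:
+ * a child block is left where it is when close_blocks() says it is
+ * already within 32KiB of its neighbour; e.g. with a 16KiB nodesize,
+ * children at bytenr X and X + 32KiB are considered close (a 16KiB
+ * gap), while X and X + 64KiB are not and the latter would be
+ * reallocated via __btrfs_cow_block().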
+ */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } + + parent_nritems = btrfs_header_nritems(parent); + blocksize = fs_info->nodesize; + end_slot = parent_nritems - 1; + + if (parent_nritems <= 1) + return 0; + + for (i = start_slot; i <= end_slot; i++) { + int close = 1; + + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + return err; +} + +/* + * Search for a key in the given extent_buffer. + * + * The lower boundary for the search is specified by the slot number @first_slot. + * Use a value of 0 to search over the whole extent buffer. Works for both + * leaves and nodes. + * + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise + * it points to the slot where you would insert the key. + * + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. + */ +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) +{ + unsigned long p; + int item_size; + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. 
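+ *
+ * With unsigned values the "(low + high) / 2" below compiles to a plain
+ * shift; with signed ints the compiler would have to emit extra
+ * instructions to round the division towards zero.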
+ */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); + int ret; + const int key_size = sizeof(struct btrfs_disk_key); + + if (unlikely(low > high)) { + btrfs_err(eb->fs_info, + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; + } + + if (btrfs_header_level(eb) == 0) { + p = offsetof(struct btrfs_leaf, items); + item_size = sizeof(struct btrfs_item); + } else { + p = offsetof(struct btrfs_node, ptrs); + item_size = sizeof(struct btrfs_key_ptr); + } + + while (low < high) { + unsigned long oip; + unsigned long offset; + struct btrfs_disk_key *tmp; + struct btrfs_disk_key unaligned; + int mid; + + mid = (low + high) / 2; + offset = p + mid * item_size; + oip = offset_in_page(offset); + + if (oip + key_size <= PAGE_SIZE) { + const unsigned long idx = get_eb_page_index(offset); + char *kaddr = page_address(eb->pages[idx]); + + oip = get_eb_offset_in_page(eb, offset); + tmp = (struct btrfs_disk_key *)(kaddr + oip); + } else { + read_extent_buffer(eb, &unaligned, offset, key_size); + tmp = &unaligned; + } + + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + return 0; + } + } + *slot = low; + return 1; +} + +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + */ +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot) +{ + int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; + struct extent_buffer *eb; + + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); + + ASSERT(level); + + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); + + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), + &check); + if (IS_ERR(eb)) + return eb; + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return ERR_PTR(-EIO); + } + + return eb; +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. 
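+ *
+ * Roughly, the function below: when the tree's root node is down to a
+ * single pointer, its only child is promoted to be the new root;
+ * otherwise it tries to shift pointers from the middle node into its
+ * left sibling, drains the right sibling into the middle, frees any
+ * node that becomes empty along the way, and updates the parent's keys
+ * whenever a node's first key changes.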
+ */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + ASSERT(level > 0); + + mid = path->nodes[level]; + + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) { + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + } + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = btrfs_read_node_slot(mid, 0); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } + + btrfs_tree_lock(child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto out; + } + + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); + if (ret < 0) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + goto out; + } + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + btrfs_clear_buffer_dirty(trans, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + /* once for the root ptr */ + free_extent_buffer_stale(mid); + return 0; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) + return 0; + + if (pslot) { + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) { + ret = PTR_ERR(left); + left = NULL; + goto out; + } + + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); + if (wret) { + ret = wret; + goto out; + } + } + + if (pslot + 1 < btrfs_header_nritems(parent)) { + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) { + ret = PTR_ERR(right); + right = NULL; + goto out; + } + + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, + BTRFS_NESTING_RIGHT_COW); + if (wret) { + ret = wret; + goto out; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, left, mid, 1); + if (wret < 0) + ret = wret; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + btrfs_clear_buffer_dirty(trans, right); + btrfs_tree_unlock(right); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); + if (ret < 0) { + free_extent_buffer_stale(right); + right = NULL; + goto out; + } + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, btrfs_root_id(root), right, + 0, 1); + free_extent_buffer_stale(right); 
+ right = NULL; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(trans, parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + if (unlikely(!left)) { + btrfs_crit(fs_info, +"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", + parent->start, btrfs_header_level(parent), + mid->start, btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; + } + wret = balance_node_right(trans, mid, left); + if (wret < 0) { + ret = wret; + goto out; + } + if (wret == 1) { + wret = push_node_left(trans, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + btrfs_clear_buffer_dirty(trans, mid); + btrfs_tree_unlock(mid); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); + if (ret < 0) { + free_extent_buffer_stale(mid); + mid = NULL; + goto out; + } + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + free_extent_buffer_stale(mid); + mid = NULL; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(trans, parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + atomic_inc(&left->refs); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +out: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. 
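+ *
+ * Unlike balance_level() above, which runs on the deletion path and
+ * merges nodes that got too empty, this helper only tries to create
+ * room for an insertion: it first attempts to push pointers from the
+ * middle node into the left sibling, then into the right one, and
+ * returns 1 if neither sibling had room, in which case the caller ends
+ * up splitting the node instead.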
+ */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + if (level < BTRFS_MAX_LEVEL - 1) { + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + } + + if (!parent) + return 1; + + /* first, try to make some room in the middle buffer */ + if (pslot) { + u32 left_nr; + + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); + + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret < 0) { + btrfs_tree_unlock(left); + free_extent_buffer(left); + btrfs_abort_transaction(trans, ret); + return ret; + } + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(trans, parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + + /* + * then try to empty the right most buffer into the middle + */ + if (pslot + 1 < btrfs_header_nritems(parent)) { + u32 right_nr; + + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); + + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right, BTRFS_NESTING_RIGHT_COW); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + btrfs_abort_transaction(trans, ret); + return ret; + } + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(trans, parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node 
of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. + */ +static void reada_for_search(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 target; + u64 nread = 0; + u64 nread_max; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1 && path->reada != READA_FORWARD_ALWAYS) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + /* + * Since the time between visiting leaves is much shorter than the time + * between visiting nodes, limit read ahead of nodes to 1, to avoid too + * much IO at once (possibly random). + */ + if (path->reada == READA_FORWARD_ALWAYS) { + if (level > 1) + nread_max = node->fs_info->nodesize; + else + nread_max = SZ_128K; + } else { + nread_max = SZ_64K; + } + + search = btrfs_node_blockptr(node, slot); + blocksize = fs_info->nodesize; + if (path->reada != READA_FORWARD_ALWAYS) { + struct extent_buffer *eb; + + eb = find_extent_buffer(fs_info, search); + if (eb) { + free_extent_buffer(eb); + return; + } + } + + target = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + + while (1) { + if (path->reada == READA_BACK) { + if (nr == 0) + break; + nr--; + } else if (path->reada == READA_FORWARD || + path->reada == READA_FORWARD_ALWAYS) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada == READA_BACK && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if (path->reada == READA_FORWARD_ALWAYS || + (search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { + btrfs_readahead_node_child(node, nr); + nread += blocksize; + } + nscan++; + if (nread > nread_max || nscan > 32) + break; + } +} + +static noinline void reada_for_balance(struct btrfs_path *path, int level) +{ + struct extent_buffer *parent; + int slot; + int nritems; + + parent = path->nodes[level + 1]; + if (!parent) + return; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level + 1]; + + if (slot > 0) + btrfs_readahead_node_child(parent, slot - 1); + if (slot + 1 < nritems) + btrfs_readahead_node_child(parent, slot + 1); +} + + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. 
so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) +{ + int i; + int skip_level = level; + bool check_skip = true; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + + if (check_skip) { + if (path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + + if (path->keep_locks) { + u32 nritems; + + nritems = btrfs_header_nritems(path->nodes[i]); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + } + + if (i >= lowest_unlock && i > skip_level) { + check_skip = false; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } + } + } +} + +/* + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. + * + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. + */ +static int +read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + const struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; + u64 blocknr; + u64 gen; + struct extent_buffer *tmp; + int ret; + int parent_level; + bool unlock_up; + + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); + blocknr = btrfs_node_blockptr(*eb_ret, slot); + gen = btrfs_node_ptr_generation(*eb_ret, slot); + parent_level = btrfs_header_level(*eb_ret); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = gen; + check.owner_root = root->root_key.objectid; + + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. + */ + tmp = find_extent_buffer(fs_info, blocknr); + if (tmp) { + if (p->reada == READA_FORWARD_ALWAYS) + reada_for_search(fs_info, p, level, slot, key->objectid); + + /* first we do an atomic uptodate check */ + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + /* + * Do extra check for first_key, eb can be stale due to + * being cached, read from scrub, or have multiple + * parents (shared tree blocks). 
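+			 * If the expected first key does not match, the cached
+			 * extent buffer cannot belong at this slot, so the
+			 * lookup fails with -EUCLEAN below.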
+ */ + if (btrfs_verify_level_key(tmp, + parent_level - 1, &check.first_key, gen)) { + free_extent_buffer(tmp); + return -EUCLEAN; + } + *eb_ret = tmp; + return 0; + } + + if (p->nowait) { + free_extent_buffer(tmp); + return -EAGAIN; + } + + if (unlock_up) + btrfs_unlock_up_safe(p, level + 1); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_extent_buffer(tmp, &check); + if (ret) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; + } + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EUCLEAN; + } + + if (unlock_up) + ret = -EAGAIN; + + goto out; + } else if (p->nowait) { + return -EAGAIN; + } + + if (unlock_up) { + btrfs_unlock_up_safe(p, level + 1); + ret = -EAGAIN; + } else { + ret = 0; + } + + if (p->reada != READA_NONE) + reada_for_search(fs_info, p, level, slot, key->objectid); + + tmp = read_tree_block(fs_info, blocknr, &check); + if (IS_ERR(tmp)) { + btrfs_release_path(p); + return PTR_ERR(tmp); + } + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!extent_buffer_uptodate(tmp)) + ret = -EIO; + +out: + if (ret == 0) { + *eb_ret = tmp; + } else { + free_extent_buffer(tmp); + btrfs_release_path(p); + } + + return ret; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len, + int *write_lock_level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { + + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + return -EAGAIN; + } + + reada_for_balance(p, level); + ret = split_node(trans, root, p, level); + + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { + + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + return -EAGAIN; + } + + reada_for_balance(p, level); + ret = balance_level(trans, root, p, level); + if (ret) + return ret; + + b = p->nodes[level]; + if (!b) { + btrfs_release_path(p); + return -EAGAIN; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return ret; +} + +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + ASSERT(path); + ASSERT(found_key); + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + 
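+	/* Found an item with the requested objectid and type. */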
return 0; +} + +static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, + struct btrfs_path *p, + int write_lock_level) +{ + struct extent_buffer *b; + int root_lock = 0; + int level = 0; + + if (p->search_commit_root) { + b = root->commit_root; + atomic_inc(&b->refs); + level = btrfs_header_level(b); + /* + * Ensure that all callers have set skip_locking when + * p->search_commit_root = 1. + */ + ASSERT(p->skip_locking == 1); + + goto out; + } + + if (p->skip_locking) { + b = btrfs_root_node(root); + level = btrfs_header_level(b); + goto out; + } + + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + + /* + * If the level is set to maximum, we can skip trying to get the read + * lock. + */ + if (write_lock_level < BTRFS_MAX_LEVEL) { + /* + * We don't know the level of the root node until we actually + * have it read locked + */ + if (p->nowait) { + b = btrfs_try_read_lock_root_node(root); + if (IS_ERR(b)) + return b; + } else { + b = btrfs_read_lock_root_node(root); + } + level = btrfs_header_level(b); + if (level > write_lock_level) + goto out; + + /* Whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + } + + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* The level might have changed, check again */ + level = btrfs_header_level(b); + +out: + /* + * The root may have failed to write out at some point, and thus is no + * longer valid, return an error in this case. + */ + if (!extent_buffer_uptodate(b)) { + if (root_lock) + btrfs_tree_unlock_rw(b, root_lock); + free_extent_buffer(b); + return ERR_PTR(-EIO); + } + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; + /* + * Callers are responsible for dropping b's references. + */ + return b; +} + +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. + */ +static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} + +static inline int search_for_key_slot(struct extent_buffer *eb, + int search_low_slot, + const struct btrfs_key *key, + int prev_cmp, + int *slot) +{ + /* + * If a previous call to btrfs_bin_search() on a parent node returned an + * exact match (prev_cmp == 0), we can safely assume the target key will + * always be at slot 0 on lower levels, since each key pointer + * (struct btrfs_key_ptr) refers to the lowest key accessible from the + * subtree it points to. Thus we can skip searching lower levels. 
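+	 *
+	 * @prev_cmp is the result of the binary search done at the level
+	 * above; btrfs_search_slot() carries it from one level to the next
+	 * and starts it out as -1 before the first level is searched.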
+ */ + if (prev_cmp == 0) { + *slot = 0; + return 0; + } + + return btrfs_bin_search(eb, search_low_slot, key, slot); +} + +static int search_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *path, + int ins_len, + int prev_cmp) +{ + struct extent_buffer *leaf = path->nodes[0]; + int leaf_free_space = -1; + int search_low_slot = 0; + int ret; + bool do_bin_search = true; + + /* + * If we are doing an insertion, the leaf has enough free space and the + * destination slot for the key is not slot 0, then we can unlock our + * write lock on the parent, and any other upper nodes, before doing the + * binary search on the leaf (with search_for_key_slot()), allowing other + * tasks to lock the parent and any other upper nodes. + */ + if (ins_len > 0) { + /* + * Cache the leaf free space, since we will need it later and it + * will not change until then. + */ + leaf_free_space = btrfs_leaf_free_space(leaf); + + /* + * !path->locks[1] means we have a single node tree, the leaf is + * the root of the tree. + */ + if (path->locks[1] && leaf_free_space >= ins_len) { + struct btrfs_disk_key first_key; + + ASSERT(btrfs_header_nritems(leaf) > 0); + btrfs_item_key(leaf, &first_key, 0); + + /* + * Doing the extra comparison with the first key is cheap, + * taking into account that the first key is very likely + * already in a cache line because it immediately follows + * the extent buffer's header and we have recently accessed + * the header's level field. + */ + ret = comp_keys(&first_key, key); + if (ret < 0) { + /* + * The first key is smaller than the key we want + * to insert, so we are safe to unlock all upper + * nodes and we have to do the binary search. + * + * We do use btrfs_unlock_up_safe() and not + * unlock_up() because the later does not unlock + * nodes with a slot of 0 - we can safely unlock + * any node even if its slot is 0 since in this + * case the key does not end up at slot 0 of the + * leaf and there's no need to split the leaf. + */ + btrfs_unlock_up_safe(path, 1); + search_low_slot = 1; + } else { + /* + * The first key is >= then the key we want to + * insert, so we can skip the binary search as + * the target key will be at slot 0. + * + * We can not unlock upper nodes when the key is + * less than the first key, because we will need + * to update the key at slot 0 of the parent node + * and possibly of other upper nodes too. + * If the key matches the first key, then we can + * unlock all the upper nodes, using + * btrfs_unlock_up_safe() instead of unlock_up() + * as stated above. + */ + if (ret == 0) + btrfs_unlock_up_safe(path, 1); + /* + * ret is already 0 or 1, matching the result of + * a btrfs_bin_search() call, so there is no need + * to adjust it. + */ + do_bin_search = false; + path->slots[0] = 0; + } + } + } + + if (do_bin_search) { + ret = search_for_key_slot(leaf, search_low_slot, key, + prev_cmp, &path->slots[0]); + if (ret < 0) + return ret; + } + + if (ins_len > 0) { + /* + * Item key already exists. In this case, if we are allowed to + * insert the item (for example, in dir_item case, item key + * collision is allowed), it will be merged with the original + * item. Only the item size grows, no new btrfs item will be + * added. If search_for_extension is not set, ins_len already + * accounts the size btrfs_item, deduct it here so leaf space + * check will be correct. 
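+		 * (sizeof(struct btrfs_item) is the per-item header that is
+		 * only consumed when a brand new item is created.)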
+ */ + if (ret == 0 && !path->search_for_extension) { + ASSERT(ins_len >= sizeof(struct btrfs_item)); + ins_len -= sizeof(struct btrfs_item); + } + + ASSERT(leaf_free_space >= 0); + + if (leaf_free_space < ins_len) { + int err; + + err = split_leaf(trans, root, key, path, ins_len, + (ret == 0)); + ASSERT(err <= 0); + if (WARN_ON(err > 0)) + err = -EUCLEAN; + if (err) + ret = err; + } + } + + return ret; +} + +/* + * btrfs_search_slot - look for a key in a tree and perform necessary + * modifications to preserve tree invariants. + * + * @trans: Handle of transaction, used when modifying the tree + * @p: Holds all btree nodes along the search path + * @root: The root node of the tree + * @key: The key we are looking for + * @ins_len: Indicates purpose of search: + * >0 for inserts it's size of item inserted (*) + * <0 for deletions + * 0 for plain searches, not modifying the tree + * + * (*) If size of item inserted doesn't include + * sizeof(struct btrfs_item), then p->search_for_extension must + * be set. + * @cow: boolean should CoW operations be performed. Must always be 1 + * when modifying the tree. + * + * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. + * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) + * + * If @key is found, 0 is returned and you can find the item in the leaf level + * of the path (level 0) + * + * If @key isn't found, 1 is returned and the leaf level of the path (level 0) + * points to the slot where it should be inserted + * + * If an error is encountered while searching the tree a negative error number + * is returned + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + /* everything at write_lock_level or lower must be write locked */ + int write_lock_level = 0; + u8 lowest_level = 0; + int min_write_lock_level; + int prev_cmp; + + might_sleep(); + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); + + /* + * For now only allow nowait for read only operations. There's no + * strict reason why we can't, we just only need it for reads so it's + * only implemented for reads. 
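+	 * The ASSERT below enforces that: a nowait search must not ask to
+	 * COW the blocks it visits.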
+ */ + ASSERT(!p->nowait || !cow); + + if (ins_len < 0) { + lowest_unlock = 2; + + /* when we are removing items, we might have to go up to level + * two as we update tree pointers Make sure we keep write + * for those levels as well + */ + write_lock_level = 2; + } else if (ins_len > 0) { + /* + * for inserting items, make sure we have a write lock on + * level 1 so we can update keys + */ + write_lock_level = 1; + } + + if (!cow) + write_lock_level = -1; + + if (cow && (p->keep_locks || p->lowest_level)) + write_lock_level = BTRFS_MAX_LEVEL; + + min_write_lock_level = write_lock_level; + + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + if (p->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) + return -EAGAIN; + } else { + down_read(&fs_info->commit_root_sem); + } + } + +again: + prev_cmp = -1; + b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } + + while (b) { + int dec = 0; + + level = btrfs_header_level(b); + + if (cow) { + bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); + + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ + if (!should_cow_block(trans, root, b)) + goto cow_done; + + /* + * must have write locks on this node and the + * parent + */ + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + if (last_level) + err = btrfs_cow_block(trans, root, b, NULL, 0, + &b, + BTRFS_NESTING_COW); + else + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b, + BTRFS_NESTING_COW); + if (err) { + ret = err; + goto done; + } + } +cow_done: + p->nodes[level] = b; + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. 
+ */ + if (!ins_len && !p->keep_locks) { + int u = level + 1; + + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } + + if (level == 0) { + if (ins_len > 0) + ASSERT(write_lock_level >= 1); + + ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock, + min_write_lock_level, NULL); + goto done; + } + + ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); + if (ret < 0) + goto done; + prev_cmp = ret; + + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + /* + * Slot 0 is special, if we change the key we have to update + * the parent pointer which means we must have a write lock on + * the parent + */ + if (slot == 0 && ins_len && write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + unlock_up(p, level, lowest_unlock, min_write_lock_level, + &write_lock_level); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(root, p, &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + level = btrfs_header_level(b); + + btrfs_maybe_reset_lockdep_class(root, b); + + if (level <= write_lock_level) { + btrfs_tree_lock(b); + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + if (p->nowait) { + if (!btrfs_try_tree_read_lock(b)) { + free_extent_buffer(b); + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(b); + } + p->locks[level] = BTRFS_READ_LOCK; + } + p->nodes[level] = b; + } + } + ret = 1; +done: + if (ret < 0 && !p->skip_release_on_error) + btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + + return ret; +} +ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); + +/* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 
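+ *
+ * As a rough usage sketch (variable names are illustrative only), a caller
+ * first pins a tree mod log sequence number and then searches the old
+ * version of the tree:
+ *
+ *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
+ *	u64 time_seq;
+ *
+ *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
+ *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ *	...
+ *	btrfs_put_tree_mod_seq(fs_info, &elem);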
+ */ +int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + ASSERT(!p->nowait); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + b = btrfs_get_old_root(root, time_seq); + if (!b) { + ret = -EIO; + goto done; + } + level = btrfs_header_level(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + int dec = 0; + + level = btrfs_header_level(b); + p->nodes[level] = b; + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + ret = btrfs_bin_search(b, 0, key, &slot); + if (ret < 0) + goto done; + + if (level == 0) { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(root, p, &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + btrfs_tree_read_lock(b); + b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; + goto done; + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + } + ret = 1; +done: + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + +/* + * Search the tree again to find a leaf with smaller keys. + * Returns 0 if it found something. + * Returns 1 if there are no smaller keys. + * Returns < 0 on error. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key orig_key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret <= 0) + return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. 
+ */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + +/* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *p, int find_higher, + int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret < 0) + return ret; + if (!ret) { + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; + return 0; + } + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } else { + --p->slots[0]; + } + } + return 0; +} + +/* + * Execute search and call btrfs_previous_item to traverse backwards if the item + * was not found. + * + * Return 0 if found, 1 if not found and < 0 if error. 
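+ *
+ * On success @key is updated to the key of the item found at
+ * path->slots[0]. As an illustrative sketch, to locate the item with the
+ * largest offset for a given objectid and type one can do:
+ *
+ *	key.offset = (u64)-1;
+ *	ret = btrfs_search_backwards(root, &key, path);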
+ */ +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + int ret; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) + ret = btrfs_previous_item(root, path, key->objectid, key->type); + + if (ret == 0) + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + + return ret; +} + +/* + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + int ret; + + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + } + + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + */ +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + struct extent_buffer *t; + int ret; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + + if (!path->nodes[i]) + break; + t = path->nodes[i]; + ret = btrfs_tree_mod_log_insert_key(t, tslot, + BTRFS_MOD_LOG_KEY_REPLACE); + BUG_ON(ret < 0); + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(trans, path->nodes[i]); + if (tslot != 0) + break; + } +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_print_leaf(eb); + btrfs_crit(fs_info, + "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + new_key->objectid, new_key->type, + new_key->offset); + BUG(); + } + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_print_leaf(eb); + btrfs_crit(fs_info, + "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + new_key->objectid, new_key->type, + new_key->offset); + BUG(); + } + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(trans, eb); + if (slot == 0) + fixup_low_keys(trans, path, &disk_key, 1); +} + +/* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. 
+ * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. + */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { + btrfs_crit(left->fs_info, "left extent buffer:"); + btrfs_print_tree(left, false); + btrfs_crit(left->fs_info, "right extent buffer:"); + btrfs_print_tree(right, false); + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. 
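+ *
+ * With @empty set the source node may be drained completely when the
+ * destination has room for all of its pointers; otherwise at least eight
+ * pointers are always left behind in @src.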
+ */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we + * don't need to do an explicit tree mod log operation for it. + */ + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't + * need to do an explicit tree mod log operation for it. 
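+	 * The memmove below first shifts the pointers already in @dst to
+	 * higher slots, making room at the front for the pointers coming
+	 * from @src.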
+ */ + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + int ret; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &lower_key, level, root->node->start, 0, + BTRFS_NESTING_NEW_ROOT); + if (IS_ERR(c)) + return PTR_ERR(c); + + root_add_used(root, fs_info->nodesize); + + btrfs_set_header_nritems(c, 1); + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(trans, c); + + old = root->node; + ret = btrfs_tree_mod_log_insert_root(root->node, c, false); + if (ret < 0) { + btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + btrfs_tree_unlock(c); + free_extent_buffer(c); + return ret; + } + rcu_assign_pointer(root->node, c); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + atomic_inc(&c->refs); + path->nodes[level] = c; + path->locks[level] = BTRFS_WRITE_LOCK; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. 
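+ *
+ * Returns 0 on success; if updating the tree mod log fails, a negative
+ * errno is returned and the transaction is aborted.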
+ */ +static int insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + int ret; + + BUG_ON(!path->nodes[level]); + btrfs_assert_tree_write_locked(path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + BUG_ON(slot > nritems); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); + if (slot != nritems) { + if (level) { + ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, + slot, nritems - slot); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + if (level) { + ret = btrfs_tree_mod_log_insert_key(lower, slot, + BTRFS_MOD_LOG_KEY_ADD); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(trans, lower); + + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* + * trying to split the root, lets make a new one + * + * tree mod log: We don't log_removal old root in + * insert_new_root, because that root buffer will be kept as a + * normal node. We are going to log removal of half of the + * elements below with btrfs_tree_mod_log_eb_copy(). We're + * holding a tree lock on the buffer, which is why we cannot + * race with other tree_mod_log users. 
+ */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + + split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, level, c->start, 0, + BTRFS_NESTING_SPLIT); + if (IS_ERR(split)) + return PTR_ERR(split); + + root_add_used(root, fs_info->nodesize); + ASSERT(btrfs_header_level(c) == level); + + ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); + if (ret) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + btrfs_abort_transaction(trans, ret); + return ret; + } + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + + btrfs_mark_buffer_dirty(trans, c); + btrfs_mark_buffer_dirty(trans, split); + + ret = insert_ptr(trans, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + if (ret < 0) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + return ret; + } + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return 0; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(const struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); + data_len = data_len - btrfs_item_offset(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +int btrfs_leaf_free_space(const struct extent_buffer *leaf) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + int nritems = btrfs_header_nritems(leaf); + int ret; + + ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + btrfs_crit(fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", + ret, + (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * min slot controls the lowest index we're willing to push to the + * right. 
We'll push up to and including min_slot, but no lower + */ +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems, + u32 min_slot) +{ + struct btrfs_fs_info *fs_info = right->fs_info; + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int push_space = 0; + int push_items = 0; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + + if (empty) + nr = 0; + else + nr = max_t(u32, 1, min_slot); + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(left); + + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(left, i); + if (this_item_size + sizeof(struct btrfs_item) + + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(struct btrfs_item); + if (i == 0) + break; + i--; + } + + if (push_items == 0) + goto out_unlock; + + WARN_ON(!empty && push_items == left_nritems); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_data_end(left, left_nritems - push_items); + push_space -= leaf_data_end(left); + + /* make room in the right data area */ + data_end = leaf_data_end(right); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); + + /* copy from the left data area */ + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); + + memmove_leaf_items(right, push_items, 0, right_nritems); + + /* copy the items from left to right */ + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); + + /* update the item pointers */ + btrfs_init_map_token(&token, right); + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); + for (i = 0; i < right_nritems; i++) { + push_space -= btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); + } + + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(trans, left); + else + btrfs_clear_buffer_dirty(trans, left); + + btrfs_mark_buffer_dirty(trans, right); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(trans, upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + btrfs_clear_buffer_dirty(trans, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. 
returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_write_locked(path->nodes[1]); + + right = btrfs_read_node_slot(upper, slot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); + + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + + free_space = btrfs_leaf_free_space(right); + if (free_space < data_size) + goto out_unlock; + + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); + if (ret) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaf. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the + * items + */ +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, u32 right_nritems, + u32 max_slot) +{ + struct btrfs_fs_info *fs_info = left->fs_info; + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + int i; + int push_space = 0; + int push_items = 0; + u32 old_left_nritems; + u32 nr; + int ret = 0; + u32 this_item_size; + u32 old_left_item_size; + struct btrfs_map_token token; + + if (empty) + nr = min(right_nritems, max_slot); + else + nr = min(right_nritems - 1, max_slot); + + for (i = 0; i < nr; i++) { + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(right); + + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, i); + if (this_item_size + sizeof(struct btrfs_item) + push_space > + free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(struct btrfs_item); + } + + if (push_items == 0) { + ret = 1; + goto out; + } + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); + + /* push data from right to left */ + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); + + push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - + btrfs_item_offset(right, push_items - 1); + + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + btrfs_init_map_token(&token, left); + old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, + ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + + /* fixup right node */ + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + + if (push_items < right_nritems) { + push_space = btrfs_item_offset(right, push_items - 1) - + leaf_data_end(right); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); + + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); + } + + btrfs_init_map_token(&token, right); + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); + for (i = 0; i < right_nritems; i++) { + push_space = push_space - btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); + } + + btrfs_mark_buffer_dirty(trans, left); + if (right_nritems) + btrfs_mark_buffer_dirty(trans, right); + else + btrfs_clear_buffer_dirty(trans, right); + + btrfs_item_key(right, &disk_key, 0); + fixup_low_keys(trans, path, &disk_key, 1); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + 
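+	/* Nothing was pushed (ret == 1), drop our reference to the left leaf. */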
btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_write_locked(path->nodes[1]); + + left = btrfs_read_node_slot(path->nodes[1], slot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); + + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + + free_space = btrfs_leaf_free_space(left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + if (ret == -ENOSPC) + ret = 1; + goto out; + } + + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; + } + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + */ +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int data_copy_size; + int rt_data_off; + int i; + int ret; + struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); + + copy_leaf_items(right, l, 0, mid, nritems); + + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); + + btrfs_init_map_token(&token, right); + for (i = 0; i < nritems; i++) { + u32 ioff; + + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); + } + + btrfs_set_header_nritems(l, mid); + btrfs_item_key(right, &disk_key, 0); + ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + if (ret < 0) + return ret; + + btrfs_mark_buffer_dirty(trans, right); + btrfs_mark_buffer_dirty(trans, l); + BUG_ON(path->slots[0] != slot); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + return 0; +} + +/* + * double splits happen when we need to insert a big item in the middle + * of a leaf. 
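All of the push and split helpers here reason about the same leaf layout: fixed-size item headers grow forward from the start of the block while item data grows backward from its end, so free space is whatever remains between the last header and the lowest data offset. A minimal userspace sketch of that accounting, where LEAF_DATA_SIZE and struct item are illustrative stand-ins rather than the kernel definitions:

#include <stdio.h>

/* Illustrative stand-ins; the real limit comes from the filesystem's nodesize. */
#define LEAF_DATA_SIZE 16384u

struct item {                 /* models a fixed-size item header */
	unsigned int offset;  /* where this item's data starts in the data area */
	unsigned int size;    /* length of the data */
};

/* Free bytes between the last item header and the start of the last item's data. */
static unsigned int leaf_free_space(const struct item *items, unsigned int nritems)
{
	unsigned int headers_end = nritems * sizeof(struct item);
	unsigned int data_start = nritems ? items[nritems - 1].offset : LEAF_DATA_SIZE;

	return data_start - headers_end;
}

int main(void)
{
	/* Two items whose data sits packed at the back of the leaf. */
	struct item items[2] = {
		{ .offset = LEAF_DATA_SIZE - 100,      .size = 100 },
		{ .offset = LEAF_DATA_SIZE - 100 - 40, .size = 40  },
	};

	printf("free space: %u bytes\n", leaf_free_space(items, 2));
	/* Pushing an item to a neighbour must fit its data plus one header there. */
	printf("cost of moving item 1: %zu bytes\n",
	       (size_t)items[1].size + sizeof(struct item));
	return 0;
}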
A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + int space_needed = data_size; + + slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(path->nodes[0]); + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(path->nodes[0]); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + int wret; + int split; + int num_doubles = 0; + int tried_avoid_double = 0; + + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) + return -EOVERFLOW; + + /* first try to make some room by pushing left and right */ + if (data_size && path->nodes[1]) { + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); + if (wret < 0) + return wret; + if (wret) { + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(l); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? 
*/ + if (btrfs_leaf_free_space(l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + split = 1; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(fs_info)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(fs_info)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + /* + * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ + right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 0, l->start, 0, + num_doubles ? BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); + if (IS_ERR(right)) + return PTR_ERR(right); + + root_add_used(root, fs_info->nodesize); + + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1] + 1, 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1], 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) + fixup_low_keys(trans, path, &disk_key, 1); + } + /* + * We create a new leaf 'right' for the required ins_len and + * we'll do btrfs_mark_buffer_dirty() on this leaf after copying + * the content of ins_len to 'right'. 
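The split-point selection in split_leaf() above boils down to a space check: starting from the middle slot, ask whether the half that will receive the new item still fits in a leaf once per-item header overhead is counted. A simplified stand-alone model of that check; the sizes, the header constant and the leaf size are made-up stand-ins, and the real function walks a larger decision tree:

#include <stdio.h>

#define LEAF_DATA_SIZE 16384u   /* illustrative; depends on nodesize */
#define ITEM_HEADER_SIZE 25u    /* stand-in for the per-item header overhead */

/* Bytes consumed by items [start, start + nr): data plus one header each. */
static unsigned int leaf_space_used(const unsigned int *sizes, int start, int nr)
{
	unsigned int used = 0;

	for (int i = start; i < start + nr; i++)
		used += sizes[i] + ITEM_HEADER_SIZE;
	return used;
}

int main(void)
{
	unsigned int sizes[] = { 4000, 3000, 2500, 2000, 1500, 1000 };
	int nritems = 6;
	unsigned int data_size = 3000;   /* the new item we must make room for */
	int mid = (nritems + 1) / 2;     /* same starting guess as split_leaf() */

	/* Would the right half plus the new item overflow a fresh leaf? */
	if (leaf_space_used(sizes, mid, nritems - mid) + data_size > LEAF_DATA_SIZE)
		printf("mid=%d does not leave room, the split point must move\n", mid);
	else
		printf("the right half plus the new %u-byte item fits at mid=%d\n",
		       data_size, mid);
	return 0;
}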
+ */ + return ret; + } + + ret = copy_for_split(trans, path, l, right, slot, mid, nritems); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } + + if (split == 2) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + + return 0; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) + return 0; + goto again; +} + +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(leaf) >= ins_len) + return 0; + + item_size = btrfs_item_size(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } + btrfs_release_path(path); + + path->keep_locks = 1; + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; + if (ret < 0) + goto err; + + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size(leaf, path->slots[0])) + goto err; + + /* the leaf has changed, it now has room. return now */ + if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; + } + + ret = split_leaf(trans, root, &key, path, ins_len, 1); + if (ret) + goto err; + + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + int orig_slot, slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + + leaf = path->nodes[0]; + /* + * Shouldn't happen because the caller must have previously called + * setup_leaf_for_split() to make room for the new item in the leaf. 
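split_item(), which continues below, never grows the payload: the original bytes keep their footprint in the data area and are simply re-described as two slots, with the new slot taking the tail of the payload at the old offset and the original slot taking the head just after it. The same arithmetic modelled on plain integers, with invented values:

#include <stdio.h>

/*
 * Offset bookkeeping of an item split, reduced to integers. The combined
 * payload keeps its original footprint; it is rewritten so the new slot
 * (one past the original) describes the tail at the old offset while the
 * original slot describes the first split_offset bytes just after it.
 */
int main(void)
{
	unsigned int orig_offset  = 12000;  /* illustrative values only */
	unsigned int item_size    = 300;
	unsigned int split_offset = 100;

	/* New slot: the tail of the payload, kept at the old offset. */
	unsigned int new_offset = orig_offset;
	unsigned int new_size   = item_size - split_offset;

	/* Original slot: the head of the payload, placed past the tail. */
	unsigned int head_offset = orig_offset + item_size - split_offset;
	unsigned int head_size   = split_offset;

	printf("original slot: offset %u size %u\n", head_offset, head_size);
	printf("new slot:      offset %u size %u\n", new_offset, new_size);
	return 0;
}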
+ */ + if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) + return -ENOSPC; + + orig_slot = path->slots[0]; + orig_offset = btrfs_item_offset(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + + slot = path->slots[0] + 1; + nritems = btrfs_header_nritems(leaf); + if (slot != nritems) { + /* shift the items */ + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + btrfs_set_item_offset(leaf, slot, orig_offset); + btrfs_set_item_size(leaf, slot, item_size - split_offset); + + btrfs_set_item_offset(leaf, orig_slot, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, orig_slot, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(trans, leaf); + + BUG_ON(btrfs_leaf_free_space(leaf) < 0); + kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, path, new_key, split_offset); + return ret; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end) +{ + int slot; + struct extent_buffer *leaf; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + struct btrfs_map_token token; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size(leaf, slot); + if (old_size == new_size) + return; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(leaf); + + old_data_start = btrfs_item_offset(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); + for (i = slot; i < nritems; i++) { + u32 ioff; + + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + size_diff); + } + + /* shift the data */ + if (from_end) { + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + BTRFS_FILE_EXTENT_INLINE_DATA_START); + } + } + + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, path, &disk_key, 1); + } + + btrfs_set_item_size(leaf, slot, new_size); + btrfs_mark_buffer_dirty(trans, leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { + btrfs_print_leaf(leaf); + BUG(); + } +} + +/* + * make the item pointed to by the path bigger, data_size is the added size. + */ +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size) +{ + int slot; + struct extent_buffer *leaf; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + struct btrfs_map_token token; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(leaf); + + if (btrfs_leaf_free_space(leaf) < data_size) { + btrfs_print_leaf(leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_data_end(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(leaf); + btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", + slot, nritems); + BUG(); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); + for (i = slot; i < nritems; i++) { + u32 ioff; + + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - data_size); + } + + /* shift the data */ + memmove_leaf_data(leaf, data_end - data_size, data_end, + old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size(leaf, slot); + btrfs_set_item_size(leaf, slot, old_size + data_size); + btrfs_mark_buffer_dirty(trans, leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { + btrfs_print_leaf(leaf); + BUG(); + } +} + +/* + * Make space in the node before inserting one or more items. 
+ * + * @trans: transaction handle + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot + */ +static void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int i; + u32 nritems; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + int slot; + struct btrfs_map_token token; + u32 total_size; + + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. + */ + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); + fixup_low_keys(trans, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + + if (btrfs_leaf_free_space(leaf) < total_size) { + btrfs_print_leaf(leaf); + btrfs_crit(fs_info, "not enough freespace need %u have %d", + total_size, btrfs_leaf_free_space(leaf)); + BUG(); + } + + btrfs_init_map_token(&token, leaf); + if (slot != nritems) { + unsigned int old_data = btrfs_item_data_end(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(leaf); + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", + slot, old_data, data_end); + BUG(); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, + ioff - batch->total_data_size); + } + /* shift the items */ + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); + + /* shift the data */ + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); + btrfs_set_item_key(leaf, &disk_key, slot + i); + data_end -= batch->data_sizes[i]; + btrfs_set_token_item_offset(&token, slot + i, data_end); + btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); + } + + btrfs_set_header_nritems(leaf, nritems + batch->nr); + btrfs_mark_buffer_dirty(trans, leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { + btrfs_print_leaf(leaf); + BUG(); + } +} + +/* + * Insert a new item into a leaf. + * + * @trans: Transaction handle. + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. + * @data_size: The size of the data associated with the new key. 
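Because the data area grows downward, carving out room for a batch means every existing item at or after the insertion slot sees its data offset drop by the batch's total data size, which is what the token loop in setup_items_for_insert() above does. A small stand-alone model of that fixup, using invented offsets:

#include <stdio.h>

#define NR_EXISTING 4

/*
 * Simplified model of the offset fixup done before a batch insert: the new
 * data is carved out below the existing data, so every item at or after the
 * insertion slot has its data offset reduced by the total new data size.
 */
int main(void)
{
	unsigned int offsets[NR_EXISTING] = { 16000, 15800, 15500, 15100 };
	int slot = 2;                        /* new items land at old slot 2 */
	unsigned int total_data_size = 300;  /* sum of the new items' data sizes */

	for (int i = slot; i < NR_EXISTING; i++)
		offsets[i] -= total_data_size;

	for (int i = 0; i < NR_EXISTING; i++)
		printf("old item %d now at data offset %u\n", i, offsets[i]);
	return 0;
}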
+ */ +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(trans, root, path, &batch); +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_item_batch *batch) +{ + int ret = 0; + int slot; + u32 total_size; + + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + return ret; + + slot = path->slots[0]; + BUG_ON(slot < 0); + + setup_items_for_insert(trans, root, path, batch); + return 0; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *cpu_key, void *data, + u32 data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(trans, leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + * + * This is exported for use inside btrfs-progs, don't un-export it. 
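Deleting a pointer from a node, which btrfs_del_ptr() below does with memmove_extent_buffer() on the real extent buffer, is an ordinary array compaction. The same operation on a plain array, with struct key_ptr as an illustrative stand-in for the node's key/block-pointer pairs:

#include <stdio.h>
#include <string.h>

struct key_ptr {
	unsigned long long key;
	unsigned long long blockptr;
};

/* Remove one pointer by sliding the tail of the array down one slot. */
static unsigned int del_ptr(struct key_ptr *ptrs, unsigned int nritems, unsigned int slot)
{
	if (slot != nritems - 1)
		memmove(&ptrs[slot], &ptrs[slot + 1],
			sizeof(struct key_ptr) * (nritems - slot - 1));
	return nritems - 1;
}

int main(void)
{
	struct key_ptr ptrs[] = {
		{ 100, 4096 }, { 200, 8192 }, { 300, 12288 }, { 400, 16384 },
	};
	unsigned int nritems = del_ptr(ptrs, 4, 1);   /* drop the key-200 entry */

	for (unsigned int i = 0; i < nritems; i++)
		printf("slot %u: key %llu -> block %llu\n", i, ptrs[i].key, ptrs[i].blockptr);
	return 0;
}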
+ */ +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + if (level) { + ret = btrfs_tree_mod_log_insert_move(parent, slot, + slot + 1, nritems - slot - 1); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } else if (level) { + ret = btrfs_tree_mod_log_insert_key(parent, slot, + BTRFS_MOD_LOG_KEY_REMOVE); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + fixup_low_keys(trans, path, &disk_key, level + 1); + } + btrfs_mark_buffer_dirty(trans, parent); + return 0; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) +{ + int ret; + + WARN_ON(btrfs_header_generation(leaf) != trans->transid); + ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); + if (ret < 0) + return ret; + + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + + root_sub_used(root, leaf->len); + + atomic_inc(&leaf->refs); + btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); + free_extent_buffer_stale(leaf); + return 0; +} +/* + * delete the item at the leaf level in path. 
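btrfs_del_items(), which follows, performs the inverse of the insert fixup: once the deleted items' data is squeezed out of the data area, every surviving item that sat after them gains that many bytes of offset. A small model with invented sizes and offsets:

#include <stdio.h>

#define NRITEMS 5

/*
 * Simplified model of the offset fixup after deleting items 1 and 2: their
 * data bytes are compacted away, so items 3 and 4 move up by dsize bytes.
 */
int main(void)
{
	unsigned int offsets[NRITEMS] = { 16000, 15700, 15400, 15000, 14600 };
	unsigned int sizes[NRITEMS]   = { 384,   300,   300,   400,   400   };
	int slot = 1, nr = 2;            /* delete items 1 and 2 */
	unsigned int dsize = 0;

	for (int i = 0; i < nr; i++)
		dsize += sizes[slot + i];

	/* Items after the deleted range regain dsize bytes of offset. */
	for (int i = slot + nr; i < NRITEMS; i++)
		offsets[i] += dsize;

	printf("freed %u bytes; item 3 now at %u, item 4 now at %u\n",
	       dsize, offsets[3], offsets[4]);
	return 0;
}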
If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf; + int ret = 0; + int wret; + u32 nritems; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); + struct btrfs_map_token token; + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); + + memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); + + btrfs_init_map_token(&token, leaf); + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + dsize); + } + + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + btrfs_clear_buffer_dirty(trans, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + fixup_low_keys(trans, path, &disk_key, 1); + } + + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ + if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to btrfs_del_ptr below + */ + slot = path->slots[1]; + atomic_inc(&leaf->refs); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. + */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. + */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; + free_extent_buffer(leaf); + ret = 0; + } else { + /* if we're still in the path, make sure + * we're dirty. 
Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(trans, leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(trans, leaf); + } + } + return ret; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are have a minimum transaction id. + * This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_path *path, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + int keep_locks = path->keep_locks; + + ASSERT(!path->nowait); + path->keep_locks = 1; +again: + cur = btrfs_read_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = BTRFS_READ_LOCK; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = btrfs_bin_search(cur, 0, min_key, &slot); + if (sret < 0) { + ret = sret; + goto out; + } + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the min_trans parameters. + * If it is too old, skip to the next one. + */ + while (slot < nritems) { + u64 gen; + + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + break; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + sret = btrfs_find_next_key(root, path, min_key, level, + min_trans); + if (sret == 0) { + btrfs_release_path(path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + goto out; + } + cur = btrfs_read_node_slot(cur, slot); + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); + goto out; + } + + btrfs_tree_read_lock(cur); + + path->locks[level - 1] = BTRFS_READ_LOCK; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1, 0, NULL); + } +out: + path->keep_locks = keep_locks; + if (ret == 0) { + btrfs_unlock_up_safe(path, path->lowest_level + 1); + memcpy(min_key, &found_key, sizeof(found_key)); + } + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. 
It looks for and returns the next key in the + * tree based on the current path and the min_trans parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. + */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int level, u64 min_trans) +{ + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks && !path->skip_locking); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) + return 1; + + if (path->locks[level + 1] || path->skip_locking) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; + } + + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq) +{ + int slot; + int level; + struct extent_buffer *c; + struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + bool need_commit_sem = false; + u32 nritems; + int ret; + int i; + + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; + btrfs_release_path(path); + + path->keep_locks = 1; + + if (time_seq) { + ret = btrfs_search_old_slot(root, &key, path, time_seq); + } else { + if (path->need_commit_sem) { + path->need_commit_sem = 0; + need_commit_sem = true; + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } + } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + } + path->keep_locks = 0; + + if (ret < 0) + goto done; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. 
+ */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + if (ret == 0) + path->slots[0]++; + ret = 0; + goto done; + } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) { + ret = 1; + goto done; + } + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } + continue; + } + + + /* + * Our current level is where we're going to start from, and to + * make sure lockdep doesn't complain we need to drop our locks + * and nodes from 0 to our current level. + */ + for (i = 0; i < level; i++) { + if (path->locks[level]) { + btrfs_tree_read_unlock(path->nodes[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + next = c; + ret = read_block_for_search(root, path, &next, level, + slot, &key); + if (ret == -EAGAIN && !path->nowait) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } + if (!ret && time_seq) { + /* + * If we don't get the lock, we may be racing + * with push_leaf_left, holding that lock while + * itself waiting for the leaf we've currently + * locked. To solve this situation, we give up + * on our lock and cycle. 
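The back-off described in the comment above is the classic way to avoid an ABBA deadlock: never block on the second lock while still holding the first. A userspace sketch of the same shape using POSIX mutexes; the kernel code releases the whole path and uses btrfs tree locks, not pthreads:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Take A, try-lock B; if B is busy, drop A and start over instead of blocking. */
static void lock_both(void)
{
	for (;;) {
		pthread_mutex_lock(&lock_a);
		if (pthread_mutex_trylock(&lock_b) == 0)
			return;                 /* got both, done */
		pthread_mutex_unlock(&lock_a); /* give up the first lock and cycle */
		sched_yield();
	}
}

int main(void)
{
	lock_both();
	printf("holding both locks\n");
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return 0;
}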
+ */ + free_extent_buffer(next); + btrfs_release_path(path); + cond_resched(); + goto again; + } + if (!ret) + btrfs_tree_read_lock(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = BTRFS_READ_LOCK; + if (!level) + break; + + ret = read_block_for_search(root, path, &next, level, + 0, &key); + if (ret == -EAGAIN && !path->nowait) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } + } + ret = 0; +done: + unlock_up(path, 0, 1, 0, NULL); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = 1; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + + return ret; +} + +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == type) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. 
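btrfs_previous_extent_item(), defined next, and btrfs_previous_item() above share the same backward walk: step to the previous slot, and when the current leaf is exhausted fall back to the last slot of the previous leaf. A toy model of that walk over integer keys, purely for illustration:

#include <stdio.h>

#define NR_LEAVES 2

/* Two "leaves" of three sorted keys each; integers stand in for real keys. */
static const int leaves[NR_LEAVES][3] = {
	{ 10, 20, 30 },
	{ 40, 50, 60 },
};

int main(void)
{
	int leaf = 1, slot = 2;      /* start positioned on key 60 */
	int min_key = 20;

	while (1) {
		if (slot == 0) {
			if (leaf == 0)
				break;          /* no previous leaf: nothing found */
			leaf--;
			slot = 2;               /* last slot of the previous leaf */
		} else {
			slot--;
		}
		if (leaves[leaf][slot] < min_key)
			break;                  /* walked past the minimum */
		printf("visited key %d\n", leaves[leaf][slot]);
	}
	return 0;
}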
+ * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 0000000000..06333a74d6 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,730 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#ifndef BTRFS_CTREE_H +#define BTRFS_CTREE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent-io-tree.h" +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" +#include "block-rsv.h" +#include "locking.h" +#include "misc.h" +#include "fs.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +struct btrfs_pending_snapshot; +struct btrfs_delayed_ref_root; +struct btrfs_space_info; +struct btrfs_block_group; +struct btrfs_ordered_sum; +struct btrfs_ref; +struct btrfs_bio; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_balance_control; +struct btrfs_delayed_root; +struct reloc_control; + +/* Read ahead values for struct btrfs_path.reada */ +enum { + READA_NONE, + READA_BACK, + READA_FORWARD, + /* + * Similar to READA_FORWARD but unlike it: + * + * 1) It will trigger readahead even for leaves that are not close to + * each other on disk; + * 2) It also triggers readahead for nodes; + * 3) During a search, even when a node or leaf is already in memory, it + * will still trigger readahead for other nodes and leaves that follow + * it. + * + * This is meant to be used only when we know we are iterating over the + * entire tree or a very large part of it. + */ + READA_FORWARD_ALWAYS, +}; + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. 
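A stand-alone illustration of what the slots array records: the index chosen at each level on the way from the root down to the leaf. The real structure also keeps the visited extent buffers in nodes[] plus per-level lock state; the keys below are invented:

#include <stdio.h>

#define MAX_LEVEL 8

static const int root_keys[3]    = { 0, 100, 200 };          /* level 1 */
static const int leaf_keys[3][3] = {                         /* level 0 */
	{ 0, 30, 60 }, { 100, 130, 160 }, { 200, 230, 260 },
};

int main(void)
{
	int slots[MAX_LEVEL] = { 0 };
	int key = 140;

	/* Child pointer in the root: last slot whose key is <= the target. */
	for (int i = 0; i < 3; i++)
		if (root_keys[i] <= key)
			slots[1] = i;

	/* Then the slot inside the chosen leaf. */
	for (int i = 0; i < 3; i++)
		if (leaf_keys[slots[1]][i] <= key)
			slots[0] = i;

	printf("key %d: root slot %d, leaf slot %d (key %d)\n",
	       key, slots[1], slots[0], leaf_keys[slots[1]][slots[0]]);
	return 0;
}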
+ */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + u8 locks[BTRFS_MAX_LEVEL]; + u8 reada; + /* keep some upper locks as we walk down */ + u8 lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; + unsigned int skip_release_on_error:1; + /* + * Indicate that new item (btrfs_search_slot) is extending already + * existing item and ins_len contains only the data size and not item + * header (ie. sizeof(struct btrfs_item) is not included). + */ + unsigned int search_for_extension:1; + /* Stop search if any locks need to be taken (for read) */ + unsigned int nowait:1; +}; + +/* + * The state of btrfs root + */ +enum { + /* + * btrfs_record_root_in_trans is a multi-step process, and it can race + * with the balancing code. But the race is very small, and only the + * first time the root is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ + BTRFS_ROOT_IN_TRANS_SETUP, + + /* + * Set if tree blocks of this root can be shared by other roots. + * Only subvolume trees and their reloc trees have this bit set. + * Conflicts with TRACK_DIRTY bit. + * + * This affects two things: + * + * - How balance works + * For shareable roots, we need to use reloc tree and do path + * replacement for balance, and need various pre/post hooks for + * snapshot creation to handle them. + * + * While for non-shareable trees, we just simply do a tree search + * with COW. + * + * - How dirty roots are tracked + * For shareable roots, btrfs_record_root_in_trans() is needed to + * track them, while non-subvolume roots have TRACK_DIRTY bit, they + * don't need to set this manually. + */ + BTRFS_ROOT_SHAREABLE, + BTRFS_ROOT_TRACK_DIRTY, + BTRFS_ROOT_IN_RADIX, + BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + BTRFS_ROOT_DEFRAG_RUNNING, + BTRFS_ROOT_FORCE_COW, + BTRFS_ROOT_MULTI_LOG_TASKS, + BTRFS_ROOT_DIRTY, + BTRFS_ROOT_DELETING, + + /* + * Reloc tree is orphan, only kept here for qgroup delayed subtree scan + * + * Set for the subvolume tree owning the reloc tree. + */ + BTRFS_ROOT_DEAD_RELOC_TREE, + /* Mark dead root stored on device whose cleanup needs to be resumed */ + BTRFS_ROOT_DEAD_TREE, + /* The root has a log tree. Used for subvolume roots and the tree root. */ + BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, + /* We started the orphan cleanup for this root. */ + BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, + /* This reloc root needs to have its buffers lockdep class reset. */ + BTRFS_ROOT_RESET_LOCKDEP_CLASS, +}; + +/* + * Record swapped tree blocks of a subvolume tree for delayed subtree trace + * code. For detail check comment in fs/btrfs/qgroup.c. + */ +struct btrfs_qgroup_swapped_blocks { + spinlock_t lock; + /* RM_EMPTY_ROOT() of above blocks[] */ + bool swapped; + struct rb_root blocks[BTRFS_MAX_LEVEL]; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. 
+ */ +struct btrfs_root { + struct rb_node rb_node; + + struct extent_buffer *node; + + struct extent_buffer *commit_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + unsigned long state; + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct mutex objectid_mutex; + + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ + atomic_t log_writers; + atomic_t log_commit[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ + atomic_t log_batch; + int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ + int last_log_commit; + pid_t log_start_pid; + + u64 last_trans; + + u32 type; + + u64 free_objectid; + + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + + /* The dirty list is only used by non-shareable roots */ + struct list_head dirty_list; + + struct list_head root_list; + + spinlock_t log_extents_lock[2]; + struct list_head logged_list[2]; + + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + + /* + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock + */ + struct radix_tree_root delayed_nodes_tree; + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + dev_t anon_dev; + + spinlock_t root_item_lock; + refcount_t refs; + + struct mutex delalloc_mutex; + spinlock_t delalloc_lock; + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; + + /* + * Not empty if this subvolume root has gone through tree block swap + * (relocation) + * + * Will be used by reloc_control::dirty_subvol_roots. + */ + struct list_head reloc_dirty_list; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; + /* + * Number of currently running deduplication operations that have a + * destination inode belonging to this root. Protected by the lock + * root_item_lock. 
+ */ + int dedupe_in_progress; + /* For exclusion of snapshot creation and nocow writes */ + struct btrfs_drew_lock snapshot_lock; + + atomic_t snapshot_force_cow; + + /* For qgroup metadata reserved space */ + spinlock_t qgroup_meta_rsv_lock; + u64 qgroup_meta_rsv_pertrans; + u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait; + + /* Number of active swapfiles */ + atomic_t nr_swapfiles; + + /* Record pairs of swapped blocks for qgroup */ + struct btrfs_qgroup_swapped_blocks swapped_blocks; + + /* Used only by log trees, when logging csum items */ + struct extent_io_tree log_csum_range; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + u64 alloc_bytenr; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { + u64 disk_offset; + u64 disk_len; + u64 data_offset; + u64 data_len; + u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ + char *extent_buf; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. + */ + int insertions; +}; + +/* Arguments for btrfs_drop_extents() */ +struct btrfs_drop_extents_args { + /* Input parameters */ + + /* + * If NULL, btrfs_drop_extents() will allocate and free its own path. + * If 'replace_extent' is true, this must not be NULL. Also the path + * is always released except if 'replace_extent' is true and + * btrfs_drop_extents() sets 'extent_inserted' to true, in which case + * the path is kept locked. + */ + struct btrfs_path *path; + /* Start offset of the range to drop extents from */ + u64 start; + /* End (exclusive, last byte + 1) of the range to drop extents from */ + u64 end; + /* If true drop all the extent maps in the range */ + bool drop_cache; + /* + * If true it means we want to insert a new extent after dropping all + * the extents in the range. If this is true, the 'extent_item_size' + * parameter must be set as well and the 'extent_inserted' field will + * be set to true by btrfs_drop_extents() if it could insert the new + * extent. + * Note: when this is set to true the path must not be NULL. + */ + bool replace_extent; + /* + * Used if 'replace_extent' is true. 
Size of the file extent item to + * insert after dropping all existing extents in the range + */ + u32 extent_item_size; + + /* Output parameters */ + + /* + * Set to the minimum between the input parameter 'end' and the end + * (exclusive, last byte + 1) of the last dropped extent. This is always + * set even if btrfs_drop_extents() returns an error. + */ + u64 drop_end; + /* + * The number of allocated bytes found in the range. This can be smaller + * than the range's length when there are holes in the range. + */ + u64 bytes_found; + /* + * Only set if 'replace_extent' is true. Set to true if we were able + * to insert a replacement extent after dropping all extents in the + * range, otherwise set to false by btrfs_drop_extents(). + * Also, if btrfs_drop_extents() has set this to true it means it + * returned with the path locked, otherwise if it has set this to + * false it has returned with the path released. + */ + bool extent_inserted; +}; + +struct btrfs_file_private { + void *filldir_buf; + u64 last_index; + struct extent_state *llseek_cached_state; +}; + +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return info->nodesize - sizeof(struct btrfs_header); +} + +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); +} + +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) +{ + return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); +} + +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); +} + +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ + ((bytes) >> (fs_info)->sectorsize_bits) + +static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) +{ + return crc32c(crc, address, length); +} + +static inline void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) crc32c(parent_objectid, name, len); +} + +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_constraint(mapping, ~__GFP_FS); +} + +int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, + u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); + +/* ctree.c */ +int __init btrfs_ctree_init(void); +void __cold btrfs_ctree_exit(void); + +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); + +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid); +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + u64 min_trans); +int 
btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_path *path, + u64 min_trans); +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot); + +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *p, int find_higher, + int return_any); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); + +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +/* + * Describes a batch of items to insert in a btree. This is used by + * btrfs_insert_empty_items(). + */ +struct btrfs_item_batch { + /* + * Pointer to an array containing the keys of the items to insert (in + * sorted order). + */ + const struct btrfs_key *keys; + /* Pointer to an array containing the data size for each item to insert. */ + const u32 *data_sizes; + /* + * The sum of data sizes for all items. The caller can compute this while + * setting up the data_sizes array, so it ends up being more efficient + * than having btrfs_insert_empty_items() or setup_item_for_insert() + * doing it, as it would avoid an extra loop over a potentially large + * array, and in the case of setup_item_for_insert(), we would be doing + * it while holding a write lock on a leaf and often on upper level nodes + * too, unnecessarily increasing the size of a critical section. 
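+ *
+ * (Added, illustrative sketch only, not part of the original header.) A
+ * caller building a two item batch can accumulate this sum while filling
+ * its local keys[] and sizes[] arrays (names are illustrative):
+ *
+ *    batch.keys = keys;
+ *    batch.data_sizes = sizes;
+ *    batch.total_data_size = sizes[0] + sizes[1];
+ *    batch.nr = 2;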
+ */ + u32 total_data_size; + /* Size of the keys and data_sizes arrays (number of items in the batch). */ + int nr; +}; + +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size); +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_item_batch *batch); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + return btrfs_insert_empty_items(trans, root, path, &batch); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq); + +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +/* + * Search in @root for a given @key, and store the slot found in @found_key. + * + * @root: The root node of the tree. + * @key: The key we are looking for. + * @found_key: Will hold the found item. + * @path: Holds the current slot/leaf. + * @iter_ret: Contains the value returned from btrfs_search_slot or + * btrfs_get_next_valid_item, whichever was executed last. + * + * The @iter_ret is an output variable that will contain the return value of + * btrfs_search_slot, if it encountered an error, or the value returned from + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid + * slot was found, 1 if there were no more leaves, and <0 if there was an error. + * + * It's recommended to use a separate variable for iter_ret and then use it to + * set the function return value so there's no confusion of the 0/1/errno + * values stemming from btrfs_search_slot. + */ +#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ + (iter_ret) >= 0 && \ + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ + (path)->slots[0]++ \ + ) + +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); + +/* + * Search the tree again to find a leaf with greater keys. + * + * Returns 0 if it found something or 1 if there are no greater leaves. + * Returns < 0 on error. 
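+ *
+ * (Added, illustrative sketch only.) A hand rolled iteration with this
+ * helper follows the pattern that btrfs_for_each_slot() above wraps,
+ * assuming @root, @key and @path were set up by the caller:
+ *
+ *    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ *    while (ret >= 0) {
+ *            if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ *                    ret = btrfs_next_leaf(root, path);
+ *                    if (ret)
+ *                            break;
+ *                    continue;
+ *            }
+ *            btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ *            (process the item at path->slots[0], then advance)
+ *            path->slots[0]++;
+ *    }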
+ */ +static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + return btrfs_next_old_item(root, p, 0); +} +int btrfs_leaf_free_space(const struct extent_buffer *leaf); + +static inline int is_fstree(u64 rootid) +{ + if (rootid == BTRFS_FS_TREE_OBJECTID || + ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && + !btrfs_qgroup_level(rootid))) + return 1; + return 0; +} + +static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) +{ + return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; +} + +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +/* + * We use page status Private2 to indicate there is an ordered extent with + * unfinished IO. + * + * Rename the Private2 accessors to Ordered, to improve readability. + */ +#define PageOrdered(page) PagePrivate2(page) +#define SetPageOrdered(page) SetPagePrivate2(page) +#define ClearPageOrdered(page) ClearPagePrivate2(page) +#define folio_test_ordered(folio) folio_test_private_2(folio) +#define folio_set_ordered(folio) folio_set_private_2(folio) +#define folio_clear_ordered(folio) folio_clear_private_2(folio) + +#endif diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c new file mode 100644 index 0000000000..f2ff4cbe86 --- /dev/null +++ b/fs/btrfs/defrag.c @@ -0,0 +1,1379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" +#include "accessors.h" +#include "messages.h" +#include "delalloc-space.h" +#include "subpage.h" +#include "defrag.h" +#include "file-item.h" +#include "super.h" + +static struct kmem_cache *btrfs_inode_defrag_cachep; + +/* + * When auto defrag is enabled we queue up these defrag structs to remember + * which inodes need defragging passes. + */ +struct inode_defrag { + struct rb_node rb_node; + /* Inode number */ + u64 ino; + /* + * Transid where the defrag was added, we search for extents newer than + * this. + */ + u64 transid; + + /* Root objectid */ + u64 root; + + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, thus + * needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; +}; + +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + +/* + * Pop a record for an inode into the defrag tree. The lock must be held + * already. + * + * If you're inserting a record for an older transid than an existing record, + * the transid already in the tree is lowered. + * + * If an existing record is found the defrag item you pass in is freed. 
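+ *
+ * (Added note.) On a duplicate this function returns -EEXIST and leaves the
+ * passed in record alone; it is the caller, btrfs_add_inode_defrag() below,
+ * that frees the duplicate record in that case.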
+ */ +static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + int ret; + + p = &fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) + p = &parent->rb_left; + else if (ret > 0) + p = &parent->rb_right; + else { + /* + * If we're reinserting an entry for an old defrag run, + * make sure to lower the transid of our existing + * record. + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); + return -EEXIST; + } + } + set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); + return 0; +} + +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(fs_info)) + return 0; + + return 1; +} + +/* + * Insert a defrag record for this inode if auto defrag is enabled. + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode_defrag *defrag; + u64 transid; + int ret; + + if (!__need_auto_defrag(fs_info)) + return 0; + + if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) + return 0; + + if (trans) + transid = trans->transid; + else + transid = inode->root->last_trans; + + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); + if (!defrag) + return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; + + spin_lock(&fs_info->defrag_inodes_lock); + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + spin_unlock(&fs_info->defrag_inodes_lock); + return 0; +} + +/* + * Pick the defragable inode that we want, if it doesn't exist, we will get the + * next one. 
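+ *
+ * (Added note.) In rbtree terms: look up the (root, ino) pair and, on an
+ * exact miss, take the in-order successor instead. Whatever entry gets
+ * returned is also removed from fs_info->defrag_inodes before the
+ * defrag_inodes_lock is dropped.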
+ */ +static struct inode_defrag *btrfs_pick_defrag_inode( + struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + cond_resched_lock(&fs_info->defrag_inodes_lock); + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_ioctl_defrag_range_args range; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; + + /* Get the inode */ + inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + btrfs_put_root(inode_root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + + /* Do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; + + sb_start_write(fs_info->sb); + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + iput(inode); + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + +cleanup: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * Run through the list of inodes in the FS that need defragging. + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. 
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * During unmount, we use the transaction_wait queue to wait for the + * defragger to stop. + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int next_key_ret = 0; + u64 last_ret = 0; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. 
+ */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + BTRFS_OLDEST_GENERATION); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + + return ret; +} + +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. 
+ */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. + */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + u64 newer_than, bool locked) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; + + /* + * Hopefully we have this extent in the tree already, try without the + * full extent lock. + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, sectorsize); + read_unlock(&em_tree->lock); + + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + + if (!em) { + struct extent_state *cached = NULL; + u64 end = start + sectorsize - 1; + + /* Get the big lock and read metadata off disk. */ + if (!locked) + lock_extent(io_tree, start, end, &cached); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); + if (!locked) + unlock_extent(io_tree, start, end, &cached); + + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return fs_info->max_extent_size; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + u32 extent_thresh, u64 newer_than, bool locked) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *next; + bool ret = false; + + /* This is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. 
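+ *
+ * (Added summary.) The checks below only report the current extent as a
+ * defrag candidate when the next extent exists, maps to a real written
+ * extent (not a hole or preallocated space), has a generation of at least
+ * @newer_than, is smaller than @extent_thresh and has not already reached
+ * the max extent capacity.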
+ */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(fs_info, em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + + ret = true; +out: + free_extent_map(next); + return ret; +} + +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. + */ + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } + + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; + + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } + + /* + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. + */ + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + } + return page; +} + +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Collect all valid target extents. 
+ * + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. + * @locked: if the range has already held extent lock + * @target_list: list of targets file extents + */ +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; + u64 cur = start; + int ret = 0; + + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; + + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); + if (!em) + break; + + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; + + /* Skip older extent */ + if (em->generation < newer_than) + goto next; + + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. 
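+ * (Added note: for the compressed case that cap is BTRFS_MAX_COMPRESSED,
+ * which is what get_extent_max_capacity() above returns for extents with
+ * the compressed flag set.)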
+ */ + if (em->len >= get_extent_max_capacity(fs_info, em)) + goto next; + + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. + */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + extent_thresh, newer_than, locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ + } + +add: + last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; + } + /* Fall through to allocate a new entry */ + } + + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; + } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); + +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. 
+ * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); + + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + } + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); + + return ret; +} + +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; + + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + + /* Lock the pages range */ + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); + /* + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. 
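+ * (Added note: the first collection pass in defrag_one_cluster() ran with
+ * @locked = false, i.e. without the extent lock, so the targets gathered
+ * there may be stale by the time the pages and the extent range are locked
+ * here.)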
+ */ + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list, last_scanned_ret); + if (ret < 0) + goto unlock_extent; + + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } +unlock_extent: + unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); +free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + kfree(pages); + return ret; +} + +static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors, + u64 *last_scanned_ret) +{ + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; + + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list, NULL); + if (ret < 0) + goto out; + + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; + + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; + break; + } + + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); + + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. + */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress, + last_scanned_ret); + if (ret < 0) + break; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; + } +out: + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); + return ret; +} + +/* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NUL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range). 
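+ *
+ * (Added, illustrative sketch only.) Defragging a whole file with the
+ * default thresholds, roughly how __btrfs_run_defrag_inode() above calls
+ * this minus the transid filter and the BTRFS_DEFRAG_BATCH limit (that
+ * caller also wraps the call in sb_start_write()/sb_end_write()):
+ *
+ *    struct btrfs_ioctl_defrag_range_args range = { 0 };
+ *
+ *    range.len = (u64)-1;
+ *    ret = btrfs_defrag_file(inode, NULL, &range, 0, 0);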
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+                      struct btrfs_ioctl_defrag_range_args *range,
+                      u64 newer_than, unsigned long max_to_defrag)
+{
+        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+        unsigned long sectors_defragged = 0;
+        u64 isize = i_size_read(inode);
+        u64 cur;
+        u64 last_byte;
+        bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+        bool ra_allocated = false;
+        int compress_type = BTRFS_COMPRESS_ZLIB;
+        int ret = 0;
+        u32 extent_thresh = range->extent_thresh;
+        pgoff_t start_index;
+
+        if (isize == 0)
+                return 0;
+
+        if (range->start >= isize)
+                return -EINVAL;
+
+        if (do_compress) {
+                if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+                        return -EINVAL;
+                if (range->compress_type)
+                        compress_type = range->compress_type;
+        }
+
+        if (extent_thresh == 0)
+                extent_thresh = SZ_256K;
+
+        if (range->start + range->len > range->start) {
+                /* Got a specific range */
+                last_byte = min(isize, range->start + range->len);
+        } else {
+                /* Defrag until file end */
+                last_byte = isize;
+        }
+
+        /* Align the range */
+        cur = round_down(range->start, fs_info->sectorsize);
+        last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+        /*
+         * If we were not given a ra, allocate a readahead context. As
+         * readahead is just an optimization, defrag will work without it so
+         * we don't error out.
+         */
+        if (!ra) {
+                ra_allocated = true;
+                ra = kzalloc(sizeof(*ra), GFP_KERNEL);
+                if (ra)
+                        file_ra_state_init(ra, inode->i_mapping);
+        }
+
+        /*
+         * Make writeback start from the beginning of the range, so that the
+         * defrag range can be written sequentially.
+         */
+        start_index = cur >> PAGE_SHIFT;
+        if (start_index < inode->i_mapping->writeback_index)
+                inode->i_mapping->writeback_index = start_index;
+
+        while (cur < last_byte) {
+                const unsigned long prev_sectors_defragged = sectors_defragged;
+                u64 last_scanned = cur;
+                u64 cluster_end;
+
+                if (btrfs_defrag_cancelled(fs_info)) {
+                        ret = -EAGAIN;
+                        break;
+                }
+
+                /* We want the cluster end at page boundary when possible */
+                cluster_end = (((cur >> PAGE_SHIFT) +
+                               (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+                cluster_end = min(cluster_end, last_byte);
+
+                btrfs_inode_lock(BTRFS_I(inode), 0);
+                if (IS_SWAPFILE(inode)) {
+                        ret = -ETXTBSY;
+                        btrfs_inode_unlock(BTRFS_I(inode), 0);
+                        break;
+                }
+                if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+                        btrfs_inode_unlock(BTRFS_I(inode), 0);
+                        break;
+                }
+                if (do_compress)
+                        BTRFS_I(inode)->defrag_compress = compress_type;
+                ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+                                cluster_end + 1 - cur, extent_thresh,
+                                newer_than, do_compress, &sectors_defragged,
+                                max_to_defrag, &last_scanned);
+
+                if (sectors_defragged > prev_sectors_defragged)
+                        balance_dirty_pages_ratelimited(inode->i_mapping);
+
+                btrfs_inode_unlock(BTRFS_I(inode), 0);
+                if (ret < 0)
+                        break;
+                cur = max(cluster_end + 1, last_scanned);
+                if (ret > 0) {
+                        ret = 0;
+                        break;
+                }
+                cond_resched();
+        }
+
+        if (ra_allocated)
+                kfree(ra);
+        /*
+         * Update range.start for autodefrag, this will indicate where to start
+         * in next run.
+         */
+        range->start = cur;
+        if (sectors_defragged) {
+                /*
+                 * We have defragged some sectors, for compression case they
+                 * need to be written back immediately.
+ */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; + } + if (do_compress) { + btrfs_inode_lock(BTRFS_I(inode), 0); + BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(BTRFS_I(inode), 0); + } + return ret; +} + +void __cold btrfs_auto_defrag_exit(void) +{ + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int __init btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h new file mode 100644 index 0000000000..5305f2283b --- /dev/null +++ b/fs/btrfs/defrag.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DEFRAG_H +#define BTRFS_DEFRAG_H + +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag); +int __init btrfs_auto_defrag_init(void); +void __cold btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +#endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c new file mode 100644 index 0000000000..eef341bbcc --- /dev/null +++ b/fs/btrfs/delalloc-space.c @@ -0,0 +1,496 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "messages.h" +#include "ctree.h" +#include "delalloc-space.h" +#include "block-rsv.h" +#include "btrfs_inode.h" +#include "space-info.h" +#include "transaction.h" +#include "qgroup.h" +#include "block-group.h" +#include "fs.h" + +/* + * HOW DOES THIS WORK + * + * There are two stages to data reservations, one for data and one for metadata + * to handle the new extents and checksums generated by writing data. + * + * + * DATA RESERVATION + * The general flow of the data reservation is as follows + * + * -> Reserve + * We call into btrfs_reserve_data_bytes() for the user request bytes that + * they wish to write. We make this reservation and add it to + * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree + * for the range and carry on if this is buffered, or follow up trying to + * make a real allocation if we are pre-allocating or doing O_DIRECT. + * + * -> Use + * At writepages()/prealloc/O_DIRECT time we will call into + * btrfs_reserve_extent() for some part or all of this range of bytes. We + * will make the allocation and subtract space_info->bytes_may_use by the + * original requested length and increase the space_info->bytes_reserved by + * the allocated length. This distinction is important because compression + * may allocate a smaller on disk extent than we previously reserved. 
+ * + * -> Allocation + * finish_ordered_io() will insert the new file extent item for this range, + * and then add a delayed ref update for the extent tree. Once that delayed + * ref is written the extent size is subtracted from + * space_info->bytes_reserved and added to space_info->bytes_used. + * + * Error handling + * + * -> By the reservation maker + * This is the simplest case, we haven't completed our operation and we know + * how much we reserved, we can simply call + * btrfs_free_reserved_data_space*() and it will be removed from + * space_info->bytes_may_use. + * + * -> After the reservation has been made, but before cow_file_range() + * This is specifically for the delalloc case. You must clear + * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will + * be subtracted from space_info->bytes_may_use. + * + * METADATA RESERVATION + * The general metadata reservation lifetimes are discussed elsewhere, this + * will just focus on how it is used for delalloc space. + * + * We keep track of two things on a per inode bases + * + * ->outstanding_extents + * This is the number of file extent items we'll need to handle all of the + * outstanding DELALLOC space we have in this inode. We limit the maximum + * size of an extent, so a large contiguous dirty area may require more than + * one outstanding_extent, which is why count_max_extents() is used to + * determine how many outstanding_extents get added. + * + * ->csum_bytes + * This is essentially how many dirty bytes we have for this inode, so we + * can calculate the number of checksum items we would have to add in order + * to checksum our outstanding data. + * + * We keep a per-inode block_rsv in order to make it easier to keep track of + * our reservation. We use btrfs_calculate_inode_block_rsv_size() to + * calculate the current theoretical maximum reservation we would need for the + * metadata for this inode. We call this and then adjust our reservation as + * necessary, either by attempting to reserve more space, or freeing up excess + * space. + * + * OUTSTANDING_EXTENTS HANDLING + * + * ->outstanding_extents is used for keeping track of how many extents we will + * need to use for this inode, and it will fluctuate depending on where you are + * in the life cycle of the dirty data. Consider the following normal case for + * a completely clean inode, with a num_bytes < our maximum allowed extent size + * + * -> reserve + * ->outstanding_extents += 1 (current value is 1) + * + * -> set_delalloc + * ->outstanding_extents += 1 (current value is 2) + * + * -> btrfs_delalloc_release_extents() + * ->outstanding_extents -= 1 (current value is 1) + * + * We must call this once we are done, as we hold our reservation for the + * duration of our operation, and then assume set_delalloc will update the + * counter appropriately. + * + * -> add ordered extent + * ->outstanding_extents += 1 (current value is 2) + * + * -> btrfs_clear_delalloc_extent + * ->outstanding_extents -= 1 (current value is 1) + * + * -> finish_ordered_io/btrfs_remove_ordered_extent + * ->outstanding_extents -= 1 (current value is 0) + * + * Each stage is responsible for their own accounting of the extent, thus + * making error handling and cleanup easier. 
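+ *
+ * (Added, illustrative sketch only.) The caller side pairing described
+ * above, as used for example by defrag_one_locked_target() in this patch:
+ *
+ *    ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ *    if (ret < 0)
+ *            return ret;
+ *    (set the range delalloc / dirty the pages here)
+ *    btrfs_delalloc_release_extents(inode, len);
+ *    extent_changeset_free(data_reserved);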
+ */ + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; + + /* Make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, fs_info->sectorsize); + + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; + + return btrfs_reserve_data_bytes(fs_info, bytes, flush); +} + +int btrfs_check_data_free_space(struct btrfs_inode *inode, + struct extent_changeset **reserved, u64 start, + u64 len, bool noflush) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; + int ret; + + /* align the range */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + if (noflush) + flush = BTRFS_RESERVE_NO_FLUSH; + else if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; + + ret = btrfs_reserve_data_bytes(fs_info, len, flush); + if (ret < 0) + return ret; + + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) { + btrfs_free_reserved_data_space_noquota(fs_info, len); + extent_changeset_free(*reserved); + *reserved = NULL; + } else { + ret = 0; + } + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). + */ +void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, + u64 len) +{ + struct btrfs_space_info *data_sinfo; + + ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); + + data_sinfo = fs_info->data_sinfo; + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-inode data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_qgroup_free_data(inode, reserved, start, len, NULL); +} + +/* + * Release any excessive reservations for an inode. + * + * @inode: the inode we need to release from + * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup + * meta reservation needs to know if we are freeing qgroup + * reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal + * release. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. 
+ */ +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + u64 qgroup_to_release = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); + else + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); +} + +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; + + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + + /* + * Insert size for the number of outstanding extents, 1 normal size for + * updating the inode. + */ + if (outstanding_extents) { + reserve_size = btrfs_calc_insert_metadata_size(fs_info, + outstanding_extents); + reserve_size += btrfs_calc_metadata_size(fs_info, 1); + } + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, + csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; + + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; + spin_unlock(&block_rsv->lock); +} + +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 disk_num_bytes, + u64 *meta_reserve, u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(fs_info, num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, + nr_extents + csum_leaves); + + /* + * finish_ordered_io has to update the inode, so add the space required + * for an inode update. + */ + *meta_reserve += inode_update; + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; + unsigned nr_extents; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + + /* + * If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
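+ *
+ * (Added summary.) The flush mode chosen below is therefore:
+ * BTRFS_RESERVE_NO_FLUSH for noflush callers and free space inodes,
+ * BTRFS_RESERVE_FLUSH_LIMIT when current->journal_info shows an open
+ * transaction, and the default BTRFS_RESERVE_FLUSH_ALL otherwise.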
+ */ + if (noflush || btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + } + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); + + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); + if (ret) + return ret; + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); + if (ret) { + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); + return ret; + } + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. + */ + nr_extents = count_max_extents(fs_info, num_bytes); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += disk_num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + /* Now we can safely add our space to our block rsv */ + btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); + + return 0; +} + +/* + * Release a metadata reservation for an inode. + * + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations, or on error for the same reason. + */ +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + spin_lock(&inode->lock); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/* + * Release our outstanding_extents for an inode. + * + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. 
This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. + */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(fs_info, num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, true); +} + +/* + * Reserve data and metadata space for delalloc + * + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. + * + * This will do the following things + * + * - reserve space in data space info for num bytes and reserve precious + * corresponding qgroup space + * (Done in check_data_free_space) + * + * - reserve space for metadata space, based on the number of outstanding + * extents and how much csums will be needed also reserve metadata space in a + * per root over-reserve method. + * - add to the inodes->delalloc_bytes + * - add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) + * + * Return 0 for success + * Return <0 for error(-ENOSPC or -EDQUOT) + */ +int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, reserved, start, len, false); + if (ret < 0) + return ret; + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); + if (ret < 0) { + btrfs_free_reserved_data_space(inode, *reserved, start, len); + extent_changeset_free(*reserved); + *reserved = NULL; + } + return ret; +} + +/* + * Release data and metadata space for delalloc + * + * @inode: inode we're releasing space for + * @reserved: list of changed/reserved ranges + * @start: start position of the space already reserved + * @len: length of the space already reserved + * @qgroup_free: should qgroup reserved-space also be freed + * + * Release the metadata space that was not used and will decrement + * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if + * there are no delalloc bytes left. Also it will handle the qgroup reserved + * space. 
+ */ +void btrfs_delalloc_release_space(struct btrfs_inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free) +{ + btrfs_delalloc_release_metadata(inode, len, qgroup_free); + btrfs_free_reserved_data_space(inode, reserved, start, len); +} diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h new file mode 100644 index 0000000000..c5d573f236 --- /dev/null +++ b/fs/btrfs/delalloc-space.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DELALLOC_SPACE_H +#define BTRFS_DELALLOC_SPACE_H + +struct extent_changeset; + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct btrfs_inode *inode, + struct extent_changeset **reserved, u64 start, u64 len, + bool noflush); +void btrfs_free_reserved_data_space(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct btrfs_inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); +void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, + u64 len); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); +int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); + +#endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c new file mode 100644 index 0000000000..16f9e5f474 --- /dev/null +++ b/fs/btrfs/delayed-inode.c @@ -0,0 +1,2200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. 
+ * Written by Miao Xie + */ + +#include <linux/slab.h> +#include <linux/iversion.h> +#include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "misc.h" +#include "delayed-inode.h" +#include "disk-io.h" +#include "transaction.h" +#include "qgroup.h" +#include "locking.h" +#include "inode-item.h" +#include "space-info.h" +#include "accessors.h" +#include "file-item.h" + +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 + +static struct kmem_cache *delayed_node_cache; + +int __init btrfs_delayed_inode_init(void) +{ + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", + sizeof(struct btrfs_delayed_node), + 0, + SLAB_MEM_SPREAD, + NULL); + if (!delayed_node_cache) + return -ENOMEM; + return 0; +} + +void __cold btrfs_delayed_inode_exit(void) +{ + kmem_cache_destroy(delayed_node_cache); +} + +static inline void btrfs_init_delayed_node( + struct btrfs_delayed_node *delayed_node, + struct btrfs_root *root, u64 inode_id) +{ + delayed_node->root = root; + delayed_node->inode_id = inode_id; + refcount_set(&delayed_node->refs, 0); + delayed_node->ins_root = RB_ROOT_CACHED; + delayed_node->del_root = RB_ROOT_CACHED; + mutex_init(&delayed_node->mutex); + INIT_LIST_HEAD(&delayed_node->n_list); + INIT_LIST_HEAD(&delayed_node->p_list); +} + +static struct btrfs_delayed_node *btrfs_get_delayed_node( + struct btrfs_inode *btrfs_inode) +{ + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(btrfs_inode); + struct btrfs_delayed_node *node; + + node = READ_ONCE(btrfs_inode->delayed_node); + if (node) { + refcount_inc(&node->refs); + return node; + } + + spin_lock(&root->inode_lock); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + + if (node) { + if (btrfs_inode->delayed_node) { + refcount_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); + spin_unlock(&root->inode_lock); + return node; + } + + /* + * It's possible that we're racing into the middle of removing + * this node from the radix tree. In this case, the refcount + * was zero and it should never go back to one. Just return + * NULL like it was never in the radix at all; our release + * function is in the process of removing it. + * + * Some implementations of refcount_inc refuse to bump the + * refcount once it has hit zero. If we don't do this dance + * here, refcount_inc() may decide to just WARN_ONCE() instead + * of actually bumping the refcount. + * + * If this node is properly in the radix, we want to bump the + * refcount twice, once for the inode and once for this get + * operation.
+ */ + if (refcount_inc_not_zero(&node->refs)) { + refcount_inc(&node->refs); + btrfs_inode->delayed_node = node; + } else { + node = NULL; + } + + spin_unlock(&root->inode_lock); + return node; + } + spin_unlock(&root->inode_lock); + + return NULL; +} + +/* Will return either the node or PTR_ERR(-ENOMEM) */ +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct btrfs_inode *btrfs_inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(btrfs_inode); + int ret; + +again: + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; + + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); + + /* cached in the btrfs inode and can be accessed */ + refcount_set(&node->refs, 2); + + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } + btrfs_inode->delayed_node = node; + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + + return node; +} + +/* + * Call it when holding delayed_node->mutex + * + * If mod = 1, add this node into the prepared list. + */ +static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node, + int mod) +{ + spin_lock(&root->lock); + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + if (!list_empty(&node->p_list)) + list_move_tail(&node->p_list, &root->prepare_list); + else if (mod) + list_add_tail(&node->p_list, &root->prepare_list); + } else { + list_add_tail(&node->n_list, &root->node_list); + list_add_tail(&node->p_list, &root->prepare_list); + refcount_inc(&node->refs); /* inserted into list */ + root->nodes++; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); + } + spin_unlock(&root->lock); +} + +/* Call it when holding delayed_node->mutex */ +static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node) +{ + spin_lock(&root->lock); + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + root->nodes--; + refcount_dec(&node->refs); /* not in the list */ + list_del_init(&node->n_list); + if (!list_empty(&node->p_list)) + list_del_init(&node->p_list); + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); + } + spin_unlock(&root->lock); +} + +static struct btrfs_delayed_node *btrfs_first_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->node_list)) + goto out; + + p = delayed_root->node_list.next; + node = list_entry(p, struct btrfs_delayed_node, n_list); + refcount_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static struct btrfs_delayed_node *btrfs_next_delayed_node( + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_root *delayed_root; + struct list_head *p; + struct btrfs_delayed_node *next = NULL; + + delayed_root = node->root->fs_info->delayed_root; + spin_lock(&delayed_root->lock); + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ + if (list_empty(&delayed_root->node_list)) + goto out; + p = delayed_root->node_list.next; + } else if 
(list_is_last(&node->n_list, &delayed_root->node_list)) + goto out; + else + p = node->n_list.next; + + next = list_entry(p, struct btrfs_delayed_node, n_list); + refcount_inc(&next->refs); +out: + spin_unlock(&delayed_root->lock); + + return next; +} + +static void __btrfs_release_delayed_node( + struct btrfs_delayed_node *delayed_node, + int mod) +{ + struct btrfs_delayed_root *delayed_root; + + if (!delayed_node) + return; + + delayed_root = delayed_node->root->fs_info->delayed_root; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + btrfs_queue_delayed_node(delayed_root, delayed_node, mod); + else + btrfs_dequeue_delayed_node(delayed_root, delayed_node); + mutex_unlock(&delayed_node->mutex); + + if (refcount_dec_and_test(&delayed_node->refs)) { + struct btrfs_root *root = delayed_node->root; + + spin_lock(&root->inode_lock); + /* + * Once our refcount goes to zero, nobody is allowed to bump it + * back up. We can delete it now. + */ + ASSERT(refcount_read(&delayed_node->refs) == 0); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, delayed_node); + } +} + +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 0); +} + +static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->prepare_list)) + goto out; + + p = delayed_root->prepare_list.next; + list_del_init(p); + node = list_entry(p, struct btrfs_delayed_node, p_list); + refcount_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static inline void btrfs_release_prepared_delayed_node( + struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 1); +} + +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, + struct btrfs_delayed_node *node, + enum btrfs_delayed_item_type type) +{ + struct btrfs_delayed_item *item; + + item = kmalloc(struct_size(item, data, data_len), GFP_NOFS); + if (item) { + item->data_len = data_len; + item->type = type; + item->bytes_reserved = 0; + item->delayed_node = node; + RB_CLEAR_NODE(&item->rb_node); + INIT_LIST_HEAD(&item->log_list); + item->logged = false; + refcount_set(&item->refs, 1); + } + return item; +} + +/* + * __btrfs_lookup_delayed_item - look up the delayed item by key + * @delayed_node: pointer to the delayed node + * @index: the dir index value to lookup (offset of a dir index key) + * + * Note: if we don't find the right item, we will return the prev item and + * the next item. 
+ */ +static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( + struct rb_root *root, + u64 index) +{ + struct rb_node *node = root->rb_node; + struct btrfs_delayed_item *delayed_item = NULL; + + while (node) { + delayed_item = rb_entry(node, struct btrfs_delayed_item, + rb_node); + if (delayed_item->index < index) + node = node->rb_right; + else if (delayed_item->index > index) + node = node->rb_left; + else + return delayed_item; + } + + return NULL; +} + +static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, + struct btrfs_delayed_item *ins) +{ + struct rb_node **p, *node; + struct rb_node *parent_node = NULL; + struct rb_root_cached *root; + struct btrfs_delayed_item *item; + bool leftmost = true; + + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_node->ins_root; + else + root = &delayed_node->del_root; + + p = &root->rb_root.rb_node; + node = &ins->rb_node; + + while (*p) { + parent_node = *p; + item = rb_entry(parent_node, struct btrfs_delayed_item, + rb_node); + + if (item->index < ins->index) { + p = &(*p)->rb_right; + leftmost = false; + } else if (item->index > ins->index) { + p = &(*p)->rb_left; + } else { + return -EEXIST; + } + } + + rb_link_node(node, parent_node, p); + rb_insert_color_cached(node, root, leftmost); + + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && + ins->index >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->index + 1; + + delayed_node->count++; + atomic_inc(&delayed_node->root->fs_info->delayed_root->items); + return 0; +} + +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(&delayed_root->items_seq); + + /* atomic_dec_return implies a barrier */ + if ((atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)) + cond_wake_up_nomb(&delayed_root->wait); +} + +static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) +{ + struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; + struct rb_root_cached *root; + struct btrfs_delayed_root *delayed_root; + + /* Not inserted, ignore it. */ + if (RB_EMPTY_NODE(&delayed_item->rb_node)) + return; + + /* If it's in a rbtree, then we need to have delayed node locked. 
*/ + lockdep_assert_held(&delayed_node->mutex); + + delayed_root = delayed_node->root->fs_info->delayed_root; + + BUG_ON(!delayed_root); + + if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_node->ins_root; + else + root = &delayed_node->del_root; + + rb_erase_cached(&delayed_item->rb_node, root); + RB_CLEAR_NODE(&delayed_item->rb_node); + delayed_node->count--; + + finish_one_item(delayed_root); +} + +static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) +{ + if (item) { + __btrfs_remove_delayed_item(item); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } +} + +static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first_cached(&delayed_node->ins_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first_cached(&delayed_node->del_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +static struct btrfs_delayed_item *__btrfs_next_delayed_item( + struct btrfs_delayed_item *item) +{ + struct rb_node *p; + struct btrfs_delayed_item *next = NULL; + + p = rb_next(&item->rb_node); + if (p) + next = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return next; +} + +static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &fs_info->delayed_block_rsv; + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + /* + * Here we migrate space rsv from transaction rsv, since have already + * reserved space when starting a transaction. So no need to reserve + * qgroup space here. + */ + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); + if (!ret) { + trace_btrfs_space_reservation(fs_info, "delayed_item", + item->delayed_node->inode_id, + num_bytes, 1); + /* + * For insertions we track reserved metadata space by accounting + * for the number of leaves that will be used, based on the delayed + * node's index_items_size field. + */ + if (item->type == BTRFS_DELAYED_DELETION_ITEM) + item->bytes_reserved = num_bytes; + } + + return ret; +} + +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!item->bytes_reserved) + return; + + rsv = &fs_info->delayed_block_rsv; + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we don't need + * to release/reserve qgroup space. + */ + trace_btrfs_space_reservation(fs_info, "delayed_item", + item->delayed_node->inode_id, + item->bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); +} + +static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, + unsigned int num_leaves) +{ + struct btrfs_fs_info *fs_info = node->root->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); + + /* There are no space reservations during log replay, bail out. 
*/ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, + bytes, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); +} + +static int btrfs_delayed_inode_reserve_metadata( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + src_rsv = trans->block_rsv; + dst_rsv = &fs_info->delayed_block_rsv; + + num_bytes = btrfs_calc_metadata_size(fs_info, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we always reserve enough to update the inode item. + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, true); + if (ret < 0) + return ret; + ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); + /* NO_FLUSH could only fail with -ENOSPC */ + ASSERT(ret == 0 || ret == -ENOSPC); + if (ret) + btrfs_qgroup_free_meta_prealloc(root, num_bytes); + } else { + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); + } + + if (!ret) { + trace_btrfs_space_reservation(fs_info, "delayed_inode", + node->inode_id, num_bytes, 1); + node->bytes_reserved = num_bytes; + } + + return ret; +} + +static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_node *node, + bool qgroup_free) +{ + struct btrfs_block_rsv *rsv; + + if (!node->bytes_reserved) + return; + + rsv = &fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(node->root, + node->bytes_reserved); + else + btrfs_qgroup_convert_reserved_meta(node->root, + node->bytes_reserved); + node->bytes_reserved = 0; +} + +/* + * Insert a single delayed item or a batch of delayed items, as many as possible + * that fit in a leaf. The delayed items (dir index keys) are sorted by their key + * in the rbtree, and if there's a gap between two consecutive dir index items, + * then it means at some point we had delayed dir indexes to add but they got + * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them + * into the subvolume tree. Dir index keys also have their offsets coming from a + * monotonically increasing counter, so we can't get new keys with an offset that + * fits within a gap between delayed dir index items. 
+ */ +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *first_item) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_node *node = first_item->delayed_node; + LIST_HEAD(item_list); + struct btrfs_delayed_item *curr; + struct btrfs_delayed_item *next; + const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); + struct btrfs_item_batch batch; + struct btrfs_key first_key; + const u32 first_data_size = first_item->data_len; + int total_size; + char *ins_data = NULL; + int ret; + bool continuous_keys_only = false; + + lockdep_assert_held(&node->mutex); + + /* + * During normal operation the delayed index offset is continuously + * increasing, so we can batch insert all items as there will not be any + * overlapping keys in the tree. + * + * The exception to this is log replay, where we may have interleaved + * offsets in the tree, so our batch needs to be continuous keys only in + * order to ensure we do not end up with out of order items in our leaf. + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + continuous_keys_only = true; + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(first_item->bytes_reserved == 0); + + list_add_tail(&first_item->tree_list, &item_list); + batch.total_data_size = first_data_size; + batch.nr = 1; + total_size = first_data_size + sizeof(struct btrfs_item); + curr = first_item; + + while (true) { + int next_size; + + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + /* + * We cannot allow gaps in the key space if we're doing log + * replay. + */ + if (continuous_keys_only && (next->index != curr->index + 1)) + break; + + ASSERT(next->bytes_reserved == 0); + + next_size = next->data_len + sizeof(struct btrfs_item); + if (total_size + next_size > max_size) + break; + + list_add_tail(&next->tree_list, &item_list); + batch.nr++; + total_size += next_size; + batch.total_data_size += next->data_len; + curr = next; + } + + if (batch.nr == 1) { + first_key.objectid = node->inode_id; + first_key.type = BTRFS_DIR_INDEX_KEY; + first_key.offset = first_item->index; + batch.keys = &first_key; + batch.data_sizes = &first_data_size; + } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; + int i = 0; + + ins_data = kmalloc(batch.nr * sizeof(u32) + + batch.nr * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) { + ret = -ENOMEM; + goto out; + } + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + list_for_each_entry(curr, &item_list, tree_list) { + ins_keys[i].objectid = node->inode_id; + ins_keys[i].type = BTRFS_DIR_INDEX_KEY; + ins_keys[i].offset = curr->index; + ins_sizes[i] = curr->data_len; + i++; + } + } + + ret = btrfs_insert_empty_items(trans, root, path, &batch); + if (ret) + goto out; + + list_for_each_entry(curr, &item_list, tree_list) { + char *data_ptr; + + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + path->slots[0]++; + } + + /* + * Now release our path before releasing the delayed items and their + * metadata reservations, so that we don't block other tasks for more + * time than needed. 
+ */ + btrfs_release_path(path); + + ASSERT(node->index_item_leaves > 0); + + /* + * For normal operations we will batch an entire leaf's worth of delayed + * items, so if there are more items to process we can decrement + * index_item_leaves by 1 as we inserted 1 leaf's worth of items. + * + * However for log replay we may not have inserted an entire leaf's + * worth of items, we may have not had continuous items, so decrementing + * here would mess up the index_item_leaves accounting. For this case + * only clean up the accounting when there are no items left. + */ + if (next && !continuous_keys_only) { + /* + * We inserted one batch of items into a leaf and there are more + * items to flush in a future batch, now release one unit of + * metadata space from the delayed block reserve, corresponding + * to the leaf we just flushed to. + */ + btrfs_delayed_item_release_leaves(node, 1); + node->index_item_leaves--; + } else if (!next) { + /* + * There are no more items to insert. We can have a number of + * reserved leaves > 1 here - this happens when many dir index + * items are added and then removed before they are flushed (file + * names with a very short life, never span a transaction). So + * release all remaining leaves. + */ + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + + list_for_each_entry_safe(curr, next, &item_list, tree_list) { + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } +out: + kfree(ins_data); + return ret; +} + +static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + int ret = 0; + + while (ret == 0) { + struct btrfs_delayed_item *curr; + + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) { + mutex_unlock(&node->mutex); + break; + } + ret = btrfs_insert_delayed_item(trans, root, path, curr); + mutex_unlock(&node->mutex); + } + + return ret; +} + +static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + const u64 ino = item->delayed_node->inode_id; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_item *curr, *next; + struct extent_buffer *leaf = path->nodes[0]; + LIST_HEAD(batch_list); + int nitems, slot, last_slot; + int ret; + u64 total_reserved_size = item->bytes_reserved; + + ASSERT(leaf != NULL); + + slot = path->slots[0]; + last_slot = btrfs_header_nritems(leaf) - 1; + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(slot <= last_slot); + if (WARN_ON(slot > last_slot)) + return -ENOENT; + + nitems = 1; + curr = item; + list_add_tail(&curr->tree_list, &batch_list); + + /* + * Keep checking if the next delayed item matches the next item in the + * leaf - if so, we can add it to the batch of items to delete from the + * leaf.
+ */ + while (slot < last_slot) { + struct btrfs_key key; + + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + slot++; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) + break; + nitems++; + curr = next; + list_add_tail(&curr->tree_list, &batch_list); + total_reserved_size += curr->bytes_reserved; + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); + if (ret) + return ret; + + /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ + if (total_reserved_size > 0) { + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we + * don't need to release/reserve qgroup space. + */ + trace_btrfs_space_reservation(fs_info, "delayed_item", ino, + total_reserved_size, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, + total_reserved_size, NULL); + } + + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + + return 0; +} + +static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_key key; + int ret = 0; + + key.objectid = node->inode_id; + key.type = BTRFS_DIR_INDEX_KEY; + + while (ret == 0) { + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_deletion_item(node); + if (!item) { + mutex_unlock(&node->mutex); + break; + } + + key.offset = item->index; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + /* + * There's no matching item in the leaf. This means we + * have already deleted this item in a past run of the + * delayed items. We ignore errors when running delayed + * items from an async context, through a work queue job + * running btrfs_async_run_delayed_root(), and don't + * release delayed items that failed to complete. This + * is because we will retry later, and at transaction + * commit time we always run delayed items and will + * then deal with errors if they fail to run again. + * + * So just release delayed items for which we can't find + * an item in the tree, and move to the next item. + */ + btrfs_release_path(path); + btrfs_release_delayed_item(item); + ret = 0; + } else if (ret == 0) { + ret = btrfs_batch_delete_items(trans, root, path, item); + btrfs_release_path(path); + } + + /* + * We unlock and relock on each iteration, this is to prevent + * blocking other tasks for too long while we are being run from + * the async context (work queue job). Those tasks are typically + * running system calls like creat/mkdir/rename/unlink/etc which + * need to add delayed items to this delayed node. 
+ */ + mutex_unlock(&node->mutex); + } + + return ret; +} + +static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + BUG_ON(!delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } +} + +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + + if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } +} + +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int mod; + int ret; + + key.objectid = node->inode_id; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item)); + btrfs_mark_buffer_dirty(trans, leaf); + + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto out; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. + */ + ret = btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); + btrfs_release_path(path); +err_out: + btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); + btrfs_release_delayed_inode(node); + + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the improper + * counts behind. 
+ */ + if (ret && ret != -ENOENT) + btrfs_abort_transaction(trans, ret); + + return ret; + +search: + btrfs_release_path(path); + + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = -1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; +} + +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + mutex_lock(&node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { + mutex_unlock(&node->mutex); + return 0; + } + + ret = __btrfs_update_delayed_inode(trans, root, path, node); + mutex_unlock(&node->mutex); + return ret; +} + +static inline int +__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + return ret; +} + +/* + * Called when committing the transaction. + * Returns 0 on success. + * Returns < 0 on error and returns with an aborted transaction with any + * outstanding delayed items cleaned up. + */ +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret = 0; + bool count = (nr > 0); + + if (TRANS_ABORTED(trans)) + return -EIO; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + block_rsv = trans->block_rsv; + trans->block_rsv = &fs_info->delayed_block_rsv; + + delayed_root = fs_info->delayed_root; + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node && (!count || nr--)) { + ret = __btrfs_commit_inode_delayed_items(trans, path, + curr_node); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); + btrfs_release_delayed_node(prev_node); + } + + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). 
+ */ + btrfs_free_path(path); + + if (curr_node) + btrfs_release_delayed_node(curr_node); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans) +{ + return __btrfs_run_delayed_items(trans, -1); +} + +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) +{ + return __btrfs_run_delayed_items(trans, nr); +} + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + path = btrfs_alloc_path(); + if (!path) { + btrfs_release_delayed_node(delayed_node); + return -ENOMEM; + } + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + + btrfs_release_delayed_node(delayed_node); + btrfs_free_path(path); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_trans_handle *trans; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + trans = btrfs_join_transaction(delayed_node->root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto trans_out; + } + + block_rsv = trans->block_rsv; + trans->block_rsv = &fs_info->delayed_block_rsv; + + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) + ret = __btrfs_update_delayed_inode(trans, delayed_node->root, + path, delayed_node); + else + ret = 0; + mutex_unlock(&delayed_node->mutex); + + btrfs_free_path(path); + trans->block_rsv = block_rsv; +trans_out: + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); +out: + btrfs_release_delayed_node(delayed_node); + + return ret; +} + +void btrfs_remove_delayed_node(struct btrfs_inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = READ_ONCE(inode->delayed_node); + if (!delayed_node) + return; + + inode->delayed_node = NULL; + btrfs_release_delayed_node(delayed_node); +} + +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; + struct btrfs_work work; +}; + +static void btrfs_async_run_delayed_root(struct btrfs_work *work) +{ + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + int total_done = 0; + + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work->delayed_root; + + path = btrfs_alloc_path(); + if (!path) + 
goto out; + + do { + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND / 2) + break; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + break; + + root = delayed_node->root; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; + continue; + } + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; + + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty_nodelay(root->fs_info); + + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; + + } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) + || total_done < async_work->nr); + + btrfs_free_path(path); +out: + wake_up(&delayed_root->wait); + kfree(async_work); +} + + +static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, + struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_async_delayed_work *async_work; + + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) + return -ENOMEM; + + async_work->delayed_root = delayed_root; + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, + NULL); + async_work->nr = nr; + + btrfs_queue_work(fs_info->delayed_workers, &async_work->work); + return 0; +} + +void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) +{ + WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); +} + +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +{ + int val = atomic_read(&delayed_root->items_seq); + + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) + return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return 1; + + return 0; +} + +void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) +{ + struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; + + if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) + return; + + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; + int ret; + + seq = atomic_read(&delayed_root->items_seq); + + ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); + if (ret) + return; + + wait_event_interruptible(delayed_root->wait, + could_end_wait(delayed_root, seq)); + return; + } + + btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); +} + +static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; +} + +/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). 
*/ +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + const char *name, int name_len, + struct btrfs_inode *dir, + struct btrfs_disk_key *disk_key, u8 flags, + u64 index) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *delayed_item; + struct btrfs_dir_item *dir_item; + bool reserve_leaf_space; + u32 data_len; + int ret; + + delayed_node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len, + delayed_node, + BTRFS_DELAYED_INSERTION_ITEM); + if (!delayed_item) { + ret = -ENOMEM; + goto release_node; + } + + delayed_item->index = index; + + dir_item = (struct btrfs_dir_item *)delayed_item->data; + dir_item->location = *disk_key; + btrfs_set_stack_dir_transid(dir_item, trans->transid); + btrfs_set_stack_dir_data_len(dir_item, 0); + btrfs_set_stack_dir_name_len(dir_item, name_len); + btrfs_set_stack_dir_flags(dir_item, flags); + memcpy((char *)(dir_item + 1), name, name_len); + + data_len = delayed_item->data_len + sizeof(struct btrfs_item); + + mutex_lock(&delayed_node->mutex); + + /* + * First attempt to insert the delayed item. This is to make the error + * handling path simpler in case we fail (-EEXIST). There's no risk of + * any other task coming in and running the delayed item before we do + * the metadata space reservation below, because we are holding the + * delayed node's mutex and that mutex must also be locked before the + * node's delayed items can be run. + */ + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); + btrfs_release_delayed_item(delayed_item); + btrfs_release_dir_index_item_space(trans); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + + if (delayed_node->index_item_leaves == 0 || + delayed_node->curr_index_batch_size + data_len > leaf_data_size) { + delayed_node->curr_index_batch_size = data_len; + reserve_leaf_space = true; + } else { + delayed_node->curr_index_batch_size += data_len; + reserve_leaf_space = false; + } + + if (reserve_leaf_space) { + ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item); + /* + * Space was reserved for a dir index item insertion when we + * started the transaction, so getting a failure here should be + * impossible. 
+ */ + if (WARN_ON(ret)) { + btrfs_release_delayed_item(delayed_item); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + + delayed_node->index_item_leaves++; + } else { + btrfs_release_dir_index_item_space(trans); + } + mutex_unlock(&delayed_node->mutex); + +release_node: + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_node *node, + u64 index) +{ + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); + if (!item) { + mutex_unlock(&node->mutex); + return 1; + } + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(item->bytes_reserved == 0); + ASSERT(node->index_item_leaves > 0); + + /* + * If there's only one leaf reserved, we can decrement this item from the + * current batch, otherwise we can not because we don't know which leaf + * it belongs to. With the current limit on delayed items, we rarely + * accumulate enough dir index items to fill more than one leaf (even + * when using a leaf size of 4K). + */ + if (node->index_item_leaves == 1) { + const u32 data_len = item->data_len + sizeof(struct btrfs_item); + + ASSERT(node->curr_index_batch_size >= data_len); + node->curr_index_batch_size -= data_len; + } + + btrfs_release_delayed_item(item); + + /* If we now have no more dir index items, we can release all leaves. */ + if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + + mutex_unlock(&node->mutex); + return 0; +} + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, u64 index) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + int ret; + + node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(node)) + return PTR_ERR(node); + + ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); + if (!ret) + goto end; + + item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); + if (!item) { + ret = -ENOMEM; + goto end; + } + + item->index = index; + + ret = btrfs_delayed_item_reserve_metadata(trans, item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible. 
+ */ + if (ret < 0) { + btrfs_err(trans->fs_info, +"metadata reservation failed for delayed dir item deltiona, should have been reserved"); + btrfs_release_delayed_item(item); + goto end; + } + + mutex_lock(&node->mutex); + ret = __btrfs_add_delayed_item(node, item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, + "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", + index, node->root->root_key.objectid, + node->inode_id, ret); + btrfs_delayed_item_release_metadata(dir->root, item); + btrfs_release_delayed_item(item); + } + mutex_unlock(&node->mutex); +end: + btrfs_release_delayed_node(node); + return ret; +} + +int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + + if (!delayed_node) + return -ENOENT; + + /* + * Since we have held i_mutex of this directory, it is impossible that + * a new directory index is added into the delayed node and index_cnt + * is updated now. So we needn't lock the delayed node. + */ + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); + return -EINVAL; + } + + inode->index_cnt = delayed_node->index_cnt; + btrfs_release_delayed_node(delayed_node); + return 0; +} + +bool btrfs_readdir_get_delayed_items(struct inode *inode, + u64 last_index, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *item; + + delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + if (!delayed_node) + return false; + + /* + * We can only do one readdir with delayed items at a time because of + * item->readdir_list. + */ + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), 0); + + mutex_lock(&delayed_node->mutex); + item = __btrfs_first_delayed_insertion_item(delayed_node); + while (item && item->index <= last_index) { + refcount_inc(&item->refs); + list_add_tail(&item->readdir_list, ins_list); + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(delayed_node); + while (item && item->index <= last_index) { + refcount_inc(&item->refs); + list_add_tail(&item->readdir_list, del_list); + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&delayed_node->mutex); + /* + * This delayed node is still cached in the btrfs inode, so refs + * must be > 1 now, and we needn't check it is going to be freed + * or not. + * + * Besides that, this function is used to read dir, we do not + * insert/delete delayed items in this period. So we also needn't + * requeue or dequeue this delayed node. + */ + refcount_dec(&delayed_node->refs); + + return true; +} + +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_item *curr, *next; + + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + if (refcount_dec_and_test(&curr->refs)) + kfree(curr); + } + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_del(&curr->readdir_list); + if (refcount_dec_and_test(&curr->refs)) + kfree(curr); + } + + /* + * The VFS is going to do up_read(), so we need to downgrade back to a + * read lock. 
+ */ + downgrade_write(&inode->i_rwsem); +} + +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index) +{ + struct btrfs_delayed_item *curr; + int ret = 0; + + list_for_each_entry(curr, del_list, readdir_list) { + if (curr->index > index) + break; + if (curr->index == index) { + ret = 1; + break; + } + } + return ret; +} + +/* + * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree + * + */ +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + struct list_head *ins_list) +{ + struct btrfs_dir_item *di; + struct btrfs_delayed_item *curr, *next; + struct btrfs_key location; + char *name; + int name_len; + int over = 0; + unsigned char d_type; + + /* + * Changing the data of the delayed item is impossible. So + * we needn't lock them. And we have held i_mutex of the + * directory, nobody can delete any directory indexes now. + */ + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + + if (curr->index < ctx->pos) { + if (refcount_dec_and_test(&curr->refs)) + kfree(curr); + continue; + } + + ctx->pos = curr->index; + + di = (struct btrfs_dir_item *)curr->data; + name = (char *)(di + 1); + name_len = btrfs_stack_dir_name_len(di); + + d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); + btrfs_disk_key_to_cpu(&location, &di->location); + + over = !dir_emit(ctx, name, name_len, + location.objectid, d_type); + + if (refcount_dec_and_test(&curr->refs)) + kfree(curr); + + if (over) + return 1; + ctx->pos++; + } + return 0; +} + +static void fill_stack_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_inode_item *inode_item, + struct inode *inode) +{ + u64 flags; + + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); + btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); + btrfs_set_stack_inode_generation(inode_item, + BTRFS_I(inode)->generation); + btrfs_set_stack_inode_sequence(inode_item, + inode_peek_iversion(inode)); + btrfs_set_stack_inode_transid(inode_item, trans->transid); + btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_flags(inode_item, flags); + btrfs_set_stack_inode_block_group(inode_item, 0); + + btrfs_set_stack_timespec_sec(&inode_item->atime, + inode->i_atime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->atime, + inode->i_atime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->mtime, + inode->i_mtime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->ctime, + inode_get_ctime(inode).tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->ctime, + inode_get_ctime(inode).tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); +} + +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + + delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if 
(!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(inode, + btrfs_stack_inode_sequence(inode_item)); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); + + inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + btrfs_stack_timespec_nsec(&inode_item->ctime)); + + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_stack_timespec_nsec(&inode_item->otime); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + int ret = 0; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + fill_stack_inode_item(trans, &delayed_node->inode_item, + &inode->vfs_inode); + goto release_node; + } + + ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + if (ret) + goto release_node; + + fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_delayed_node *delayed_node; + + /* + * we don't do delayed inode updates during log recovery because it + * leads to enospc problems. 
This means we also can't do + * delayed inode refs. + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return -EAGAIN; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion because: + * - We ONLY do async inode ref deletion for the inode that has only + * one link (i_nlink == 1), which means there is only one inode ref. + * And in most cases, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * we also needn't worry about the enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. + */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_root *root = delayed_node->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_item *curr_item, *prev_item; + + mutex_lock(&delayed_node->mutex); + curr_item = __btrfs_first_delayed_insertion_item(delayed_node); + while (curr_item) { + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + if (delayed_node->index_item_leaves > 0) { + btrfs_delayed_item_release_leaves(delayed_node, + delayed_node->index_item_leaves); + delayed_node->index_item_leaves = 0; + } + + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + btrfs_release_delayed_iref(delayed_node); + + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); + btrfs_release_delayed_inode(delayed_node); + } + mutex_unlock(&delayed_node->mutex); +} + +void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + __btrfs_kill_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node); +} + +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) +{ + u64 inode_id = 0; + struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; + + while (1) { + spin_lock(&root->inode_lock); + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { + spin_unlock(&root->inode_lock); + break; + } + + inode_id = delayed_nodes[n - 1]->inode_id + 1; + for (i = 0; i < n; i++) { + /* + * Don't increase refs in case the node is dead and + * about to be removed from the tree in the loop below. + */ + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; + } + spin_unlock(&root->inode_lock);
+ + for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; + __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i]); + } + } +} + +void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_delayed_node *curr_node, *prev_node; + + curr_node = btrfs_first_delayed_node(fs_info->delayed_root); + while (curr_node) { + __btrfs_kill_delayed_node(curr_node); + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } +} + +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_insertion_item(node); + while (item) { + /* + * It's possible that the item is already in a log list. This + * can happen in case two tasks are trying to log the same + * directory. For example if we have tasks A and task B: + * + * Task A collected the delayed items into a log list while + * under the inode's log_mutex (at btrfs_log_inode()), but it + * only releases the items after logging the inodes they point + * to (if they are new inodes), which happens after unlocking + * the log mutex; + * + * Task B enters btrfs_log_inode() and acquires the log_mutex + * of the same directory inode, before task B releases the + * delayed items. This can happen for example when logging some + * inode we need to trigger logging of its parent directory, so + * logging two files that have the same parent directory can + * lead to this. + * + * If this happens, just ignore delayed items already in a log + * list. All the tasks logging the directory are under a log + * transaction and whichever finishes first can not sync the log + * before the other completes and leaves the log transaction. + */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, ins_list); + } + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(node); + while (item) { + /* It may be non-empty, for the same reason mentioned above. */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, del_list); + } + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. 
+ */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} + +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_delayed_item *next; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + + list_for_each_entry_safe(item, next, ins_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + list_for_each_entry_safe(item, next, del_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h new file mode 100644 index 0000000000..1da213197f --- /dev/null +++ b/fs/btrfs/delayed-inode.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + */ + +#ifndef BTRFS_DELAYED_INODE_H +#define BTRFS_DELAYED_INODE_H + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" + +enum btrfs_delayed_item_type { + BTRFS_DELAYED_INSERTION_ITEM, + BTRFS_DELAYED_DELETION_ITEM +}; + +struct btrfs_delayed_root { + spinlock_t lock; + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. If the delayed node is inserted into the work queue, we + * drop it from this list. + */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ + int nodes; /* for delayed nodes */ + wait_queue_head_t wait; +}; + +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 + +struct btrfs_delayed_node { + u64 inode_id; + u64 bytes_reserved; + struct btrfs_root *root; + /* Used to add the node into the delayed root's node list. */ + struct list_head n_list; + /* + * Used to add the node into the prepare list, the nodes in this list + * is waiting to be dealt with by the async worker. + */ + struct list_head p_list; + struct rb_root_cached ins_root; + struct rb_root_cached del_root; + struct mutex mutex; + struct btrfs_inode_item inode_item; + refcount_t refs; + u64 index_cnt; + unsigned long flags; + int count; + /* + * The size of the next batch of dir index items to insert (if this + * node is from a directory inode). Protected by @mutex. + */ + u32 curr_index_batch_size; + /* + * Number of leaves reserved for inserting dir index items (if this + * node belongs to a directory inode). This may be larger then the + * actual number of leaves we end up using. Protected by @mutex. + */ + u32 index_item_leaves; +}; + +struct btrfs_delayed_item { + struct rb_node rb_node; + /* Offset value of the corresponding dir index key. 
*/ + u64 index; + struct list_head tree_list; /* used for batch insert/delete items */ + struct list_head readdir_list; /* used for readdir items */ + /* + * Used when logging a directory. + * Insertions and deletions to this list are protected by the parent + * delayed node's mutex. + */ + struct list_head log_list; + u64 bytes_reserved; + struct btrfs_delayed_node *delayed_node; + refcount_t refs; + enum btrfs_delayed_item_type type:8; + /* + * Track if this delayed item was already logged. + * Protected by the mutex of the parent delayed inode. + */ + bool logged; + /* The maximum leaf size is 64K, so u16 is more than enough. */ + u16 data_len; + char data[] __counted_by(data_len); +}; + +static inline void btrfs_init_delayed_root( + struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + const char *name, int name_len, + struct btrfs_inode *dir, + struct btrfs_disk_key *disk_key, u8 flags, + u64 index); + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, u64 index); + +int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode); + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr); + +void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info); + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode); +/* Used for evicting the inode. */ +void btrfs_remove_delayed_node(struct btrfs_inode *inode); +void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode); +int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); + + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); + +/* Used for drop dead root */ +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); + +/* Used for clean the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info); + +/* Used for readdir() */ +bool btrfs_readdir_get_delayed_items(struct inode *inode, + u64 last_index, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index); +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + struct list_head *ins_list); + +/* Used during directory logging. 
*/ +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); + +/* for init */ +int __init btrfs_delayed_inode_init(void); +void __cold btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 0000000000..9fe4ccca50 --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,1160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2009 Oracle. All rights reserved. + */ + +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" +#include "qgroup.h" +#include "space-info.h" +#include "tree-mod-log.h" +#include "fs.h" + +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + */ + +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; + + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); + + /* + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. + */ + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); + return ret; +} + +/* + * Release a ref head's reservation. + * + * @fs_info: the filesystem + * @nr: number of items to drop + * + * Drops the delayed ref head's count from the delayed refs rsv and free any + * excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); + u64 released = 0; + + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * Adjust the size of the delayed refs rsv. + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. 
+ */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, + trans->delayed_ref_updates); + + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} + +/* + * Transfer bytes to our delayed refs rsv. + * + * @fs_info: the filesystem + * @num_bytes: number of bytes to transfer + * + * This transfers up to the num_bytes amount, previously reserved, to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = true; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + btrfs_space_info_free_bytes_may_use(fs_info, + delayed_refs_rsv->space_info, to_free); +} + +/* + * Refill based on our delayed refs usage. + * + * @fs_info: the filesystem + * @flush: control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. + */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); + u64 num_bytes = 0; + u64 refilled_bytes; + u64 to_free; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + if (ret) + return ret; + + /* + * We may have raced with someone else, so check again if we the block + * reserve is still not full and release any excess space. 
+ */ + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + u64 needed = block_rsv->size - block_rsv->reserved; + + if (num_bytes >= needed) { + block_rsv->reserved += needed; + block_rsv->full = true; + to_free = num_bytes - needed; + refilled_bytes = needed; + } else { + block_rsv->reserved += num_bytes; + to_free = 0; + refilled_bytes = num_bytes; + } + } else { + to_free = num_bytes; + refilled_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (to_free > 0) + btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info, + to_free); + + if (refilled_bytes > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, + refilled_bytes, 1); + return 0; +} + +/* + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, + struct btrfs_delayed_tree_ref *ref2) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type + */ +static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, + struct btrfs_delayed_data_ref *ref2) +{ + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +static int comp_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2, + bool check_seq) +{ + int ret = 0; + + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), + btrfs_delayed_node_to_tree_ref(ref2)); + else + ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), + btrfs_delayed_node_to_data_ref(ref2)); + if (ret) + return ret; + if (check_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } + return 0; +} + +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + bool leftmost = true; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->bytenr) { + p = &(*p)->rb_left; + } else if (bytenr > entry->bytenr) { + p = &(*p)->rb_right; + leftmost = false; + } else { + return entry; + } + } + + rb_link_node(node, parent_node, p); + rb_insert_color_cached(node, root, leftmost); + return NULL; +} + +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, + struct btrfs_delayed_ref_node *ins) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *node = &ins->ref_node; + struct rb_node *parent_node = 
NULL; + struct btrfs_delayed_ref_node *entry; + bool leftmost = true; + + while (*p) { + int comp; + + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + ref_node); + comp = comp_refs(ins, entry, true); + if (comp < 0) { + p = &(*p)->rb_left; + } else if (comp > 0) { + p = &(*p)->rb_right; + leftmost = false; + } else { + return entry; + } + } + + rb_link_node(node, parent_node, p); + rb_insert_color_cached(node, root, leftmost); + return NULL; +} + +static struct btrfs_delayed_ref_head *find_first_ref_head( + struct btrfs_delayed_ref_root *dr) +{ + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = rb_first_cached(&dr->href_root); + if (!n) + return NULL; + + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + return entry; +} + +/* + * Find a head entry based on bytenr. This returns the delayed ref head if it + * was able to find one, or NULL if nothing was in that spot. If return_bigger + * is given, the next bigger entry is returned if no exact match is found. + */ +static struct btrfs_delayed_ref_head *find_ref_head( + struct btrfs_delayed_ref_root *dr, u64 bytenr, + bool return_bigger) +{ + struct rb_root *root = &dr->href_root.rb_root; + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = root->rb_node; + entry = NULL; + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return entry; + } + if (entry && return_bigger) { + if (bytenr > entry->bytenr) { + n = rb_next(&entry->href_node); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + } + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + lockdep_assert_held(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (RB_EMPTY_NODE(&head->href_node)) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + return -EAGAIN; + } + btrfs_put_delayed_ref_head(head); + return 0; +} + +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref) +{ + lockdep_assert_held(&head->lock); + rb_erase_cached(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + btrfs_put_delayed_ref(ref); + atomic_dec(&delayed_refs->num_entries); +} + +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +{ + struct btrfs_delayed_ref_node *next; + struct rb_node *node = rb_next(&ref->ref_node); + bool done = false; + + while (!done && node) { + int mod; + + next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + node = rb_next(node); + if (seq && next->seq >= seq) + break; + if (comp_refs(ref, next, false)) + break; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + swap(ref, next); + done = true; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(delayed_refs, head, ref); + done = true; + } else { + /* + * Can't 
have multiples of the same ref on a tree block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + } + + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + u64 seq = 0; + + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return; + + /* We don't have too many refs to merge for data. */ + if (head->is_data) + return; + + seq = btrfs_tree_mod_log_lowest_seq(fs_info); +again: + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + if (seq && ref->seq >= seq) + continue; + if (merge_ref(delayed_refs, head, ref, seq)) + goto again; + } +} + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) +{ + int ret = 0; + u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); + + if (min_seq != 0 && seq >= min_seq) { + btrfs_debug(fs_info, + "holding back delayed_ref %llu, lowest is %llu", + seq, min_seq); + ret = 1; + } + + return ret; +} + +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs) +{ + struct btrfs_delayed_ref_head *head; + + lockdep_assert_held(&delayed_refs->lock); +again: + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, + true); + if (!head && delayed_refs->run_delayed_start != 0) { + delayed_refs->run_delayed_start = 0; + head = find_first_ref_head(delayed_refs); + } + if (!head) + return NULL; + + while (head->processing) { + struct rb_node *node; + + node = rb_next(&head->href_node); + if (!node) { + if (delayed_refs->run_delayed_start == 0) + return NULL; + delayed_refs->run_delayed_start = 0; + goto again; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + } + + head->processing = true; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->bytenr + + head->num_bytes; + return head; +} + +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + lockdep_assert_held(&delayed_refs->lock); + lockdep_assert_held(&head->lock); + + rb_erase_cached(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + atomic_dec(&delayed_refs->num_entries); + delayed_refs->num_heads--; + if (!head->processing) + delayed_refs->num_heads_ready--; +} + +/* + * Helper to insert the ref_node to the tail or merge with tail. + * + * Return false if the ref was inserted. + * Return true if the ref was merged into an existing one (and therefore can be + * freed by the caller). 
+ */ +static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) +{ + struct btrfs_delayed_ref_node *exist; + int mod; + + spin_lock(&href->lock); + exist = tree_insert(&href->ref_tree, ref); + if (!exist) { + if (ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&ref->add_list, &href->ref_add_list); + atomic_inc(&root->num_entries); + spin_unlock(&href->lock); + return false; + } + + /* Now we are sure we can merge */ + if (exist->action == ref->action) { + mod = ref->ref_mod; + } else { + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + if (ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&exist->add_list, + &href->ref_add_list); + else if (ref->action == BTRFS_DROP_DELAYED_REF) { + ASSERT(!list_empty(&exist->add_list)); + list_del(&exist->add_list); + } else { + ASSERT(0); + } + } else + mod = -ref->ref_mod; + } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(root, href, exist); + spin_unlock(&href->lock); + return true; +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *existing, + struct btrfs_delayed_ref_head *update) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + int old_ref_mod; + + BUG_ON(existing->is_data != update->is_data); + + spin_lock(&existing->lock); + if (update->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing->must_insert_reserved = update->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + if (update->extent_op) { + if (!existing->extent_op) { + existing->extent_op = update->extent_op; + } else { + if (update->extent_op->update_key) { + memcpy(&existing->extent_op->key, + &update->extent_op->key, + sizeof(update->extent_op->key)); + existing->extent_op->update_key = true; + } + if (update->extent_op->update_flags) { + existing->extent_op->flags_to_set |= + update->extent_op->flags_to_set; + existing->extent_op->update_flags = true; + } + btrfs_free_delayed_extent_op(update->extent_op); + } + } + /* + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. + */ + old_ref_mod = existing->total_ref_mod; + existing->ref_mod += update->ref_mod; + existing->total_ref_mod += update->ref_mod; + + /* + * If we are going to from a positive ref mod to a negative or vice + * versa we need to make sure to adjust pending_csums accordingly. 
+ */ + if (existing->is_data) { + u64 csum_leaves = + btrfs_csum_bytes_to_leaves(fs_info, + existing->num_bytes); + + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { + delayed_refs->pending_csums -= existing->num_bytes; + btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + } + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { + delayed_refs->pending_csums += existing->num_bytes; + trans->delayed_ref_updates += csum_leaves; + } + } + + spin_unlock(&existing->lock); +} + +static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, u64 ref_root, + u64 reserved, int action, bool is_data, + bool is_system) +{ + int count_mod = 1; + bool must_insert_reserved = false; + + /* If reserved is provided, it must be a data extent. */ + BUG_ON(!is_data && reserved); + + switch (action) { + case BTRFS_UPDATE_DELAYED_HEAD: + count_mod = 0; + break; + case BTRFS_DROP_DELAYED_REF: + /* + * The head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ + count_mod = -1; + break; + case BTRFS_ADD_DELAYED_EXTENT: + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update the + * reserved accounting when the extent is finally added, or if a + * later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record that + * accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not + * required. + */ + must_insert_reserved = true; + break; + } + + refcount_set(&head_ref->refs, 1); + head_ref->bytenr = bytenr; + head_ref->num_bytes = num_bytes; + head_ref->ref_mod = count_mod; + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + head_ref->is_system = is_system; + head_ref->ref_tree = RB_ROOT_CACHED; + INIT_LIST_HEAD(&head_ref->ref_add_list); + RB_CLEAR_NODE(&head_ref->href_node); + head_ref->processing = false; + head_ref->total_ref_mod = count_mod; + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); + + if (qrecord) { + if (ref_root && reserved) { + qrecord->data_rsv = reserved; + qrecord->data_rsv_refroot = ref_root; + } + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + } +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. 
+ */ +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + int action, bool *qrecord_inserted_ret) +{ + struct btrfs_delayed_ref_head *existing; + struct btrfs_delayed_ref_root *delayed_refs; + bool qrecord_inserted = false; + + delayed_refs = &trans->transaction->delayed_refs; + + /* Record qgroup extent info if provided */ + if (qrecord) { + if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, + delayed_refs, qrecord)) + kfree(qrecord); + else + qrecord_inserted = true; + } + + trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); + if (existing) { + update_existing_head_ref(trans, existing, head_ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; + } else { + if (head_ref->is_data && head_ref->ref_mod < 0) { + delayed_refs->pending_csums += head_ref->num_bytes; + trans->delayed_ref_updates += + btrfs_csum_bytes_to_leaves(trans->fs_info, + head_ref->num_bytes); + } + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + atomic_inc(&delayed_refs->num_entries); + trans->delayed_ref_updates++; + } + if (qrecord_inserted_ret) + *qrecord_inserted_ret = qrecord_inserted; + + return head_ref; +} + +/* + * init_delayed_ref_common - Initialize the structure which represents a + * modification to a an extent. + * + * @fs_info: Internal to the mounted filesystem mount structure. + * + * @ref: The structure which is going to be initialized. + * + * @bytenr: The logical address of the extent for which a modification is + * going to be recorded. + * + * @num_bytes: Size of the extent whose modification is being recorded. + * + * @ref_root: The id of the root where this modification has originated, this + * can be either one of the well-known metadata trees or the + * subvolume id which references this extent. + * + * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or + * BTRFS_ADD_DELAYED_EXTENT + * + * @ref_type: Holds the type of the extent which is being recorded, can be + * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY + * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ + * BTRFS_EXTENT_DATA_REF_KEY when recording data extent + */ +static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 ref_root, + int action, u8 ref_type) +{ + u64 seq = 0; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + + refcount_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->seq = seq; + ref->type = ref_type; + RB_CLEAR_NODE(&ref->ref_node); + INIT_LIST_HEAD(&ref->add_list); +} + +/* + * add a delayed tree ref. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. 
+ */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; + bool qrecord_inserted; + bool is_system; + bool merged; + int action = generic_ref->action; + int level = generic_ref->tree_ref.level; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u8 ref_type; + + is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); + + ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) { + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + return -ENOMEM; + } + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && + !generic_ref->skip_qgroup) { + record = kzalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return -ENOMEM; + } + } + + if (parent) + ref_type = BTRFS_SHARED_BLOCK_REF_KEY; + else + ref_type = BTRFS_TREE_BLOCK_REF_KEY; + + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + generic_ref->tree_ref.owning_root, action, + ref_type); + ref->root = generic_ref->tree_ref.owning_root; + ref->parent = parent; + ref->level = level; + + init_delayed_ref_head(head_ref, record, bytenr, num_bytes, + generic_ref->tree_ref.owning_root, 0, action, + false, is_system); + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, + action == BTRFS_ADD_DELAYED_EXTENT ? + BTRFS_ADD_DELAYED_REF : action); + if (merged) + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + if (qrecord_inserted) + btrfs_qgroup_trace_extent_post(trans, record); + + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 
+ */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + u64 reserved) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; + bool qrecord_inserted; + int action = generic_ref->action; + bool merged; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u64 ref_root = generic_ref->data_ref.owning_root; + u64 owner = generic_ref->data_ref.ino; + u64 offset = generic_ref->data_ref.offset; + u8 ref_type; + + ASSERT(generic_ref->type == BTRFS_REF_DATA && action); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); + if (!ref) + return -ENOMEM; + + if (parent) + ref_type = BTRFS_SHARED_DATA_REF_KEY; + else + ref_type = BTRFS_EXTENT_DATA_REF_KEY; + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); + ref->root = ref_root; + ref->parent = parent; + ref->objectid = owner; + ref->offset = offset; + + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + return -ENOMEM; + } + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && + !generic_ref->skip_qgroup) { + record = kzalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + + init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, + reserved, action, true, false); + head_ref->extent_op = NULL; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, + action == BTRFS_ADD_DELAYED_EXTENT ? + BTRFS_ADD_DELAYED_REF : action); + if (merged) + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + + + if (qrecord_inserted) + return btrfs_qgroup_trace_extent_post(trans, record); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, + BTRFS_UPDATE_DELAYED_HEAD, false, false); + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, + NULL); + + spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + return 0; +} + +/* + * This does a simple search for the head node for a given extent. Returns the + * head node if found, or NULL if not. 
+ */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) +{ + lockdep_assert_held(&delayed_refs->lock); + + return find_ref_head(delayed_refs, bytenr, false); +} + +void __cold btrfs_delayed_ref_exit(void) +{ + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int __init btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; + + return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 0000000000..fd9bf2b709 --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,428 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + */ + +#ifndef BTRFS_DELAYED_REF_H +#define BTRFS_DELAYED_REF_H + +#include + +/* these are the possible values of struct btrfs_delayed_ref_node->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node ref_node; + /* + * If action is BTRFS_ADD_DELAYED_REF, also link this node to + * ref_head->ref_add_list, then we do not need to iterate the + * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + */ + struct list_head add_list; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the size of the extent */ + u64 num_bytes; + + /* seq number to keep track of insertion order */ + u64 seq; + + /* ref count on this data structure */ + refcount_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + unsigned int action:8; + unsigned int type:8; +}; + +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u8 level; + bool update_key; + bool update_flags; + u64 flags_to_set; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. 
+ */ +struct btrfs_delayed_ref_head { + u64 bytenr; + u64 num_bytes; + /* + * For insertion into struct btrfs_delayed_ref_root::href_root. + * Keep it in the same cache line as 'bytenr' for more efficient + * searches in the rbtree. + */ + struct rb_node href_node; + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + refcount_t refs; + + /* Protects 'ref_tree' and 'ref_add_list'. */ + spinlock_t lock; + struct rb_root_cached ref_tree; + /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ + struct list_head ref_add_list; + + struct btrfs_delayed_extent_op *extent_op; + + /* + * This is used to track the final ref_mod from all the refs associated + * with this head ref, this is not adjusted as delayed refs are run, + * this is meant to track if we need to do the csum accounting or not. + */ + int total_ref_mod; + + /* + * This is the current outstanding mod references for this bytenr. This + * is used with lookup_extent_info to get an accurate reference count + * for a bytenr, so it is adjusted as delayed refs are run so that any + * on disk reference count + ref_mod is accurate. + */ + int ref_mod; + + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. + */ + bool must_insert_reserved; + bool is_data; + bool is_system; + bool processing; +}; + +struct btrfs_delayed_tree_ref { + struct btrfs_delayed_ref_node node; + u64 root; + u64 parent; + int level; +}; + +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + u64 root; + u64 parent; + u64 objectid; + u64 offset; +}; + +enum btrfs_delayed_ref_flags { + /* Indicate that we are flushing delayed refs for the commit */ + BTRFS_DELAYED_REFS_FLUSHING, +}; + +struct btrfs_delayed_ref_root { + /* head ref rbtree */ + struct rb_root_cached href_root; + + /* dirty extent records */ + struct rb_root dirty_extent_root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + atomic_t num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + u64 pending_csums; + + unsigned long flags; + + u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manually + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; +}; + +enum btrfs_ref_type { + BTRFS_REF_NOT_SET, + BTRFS_REF_DATA, + BTRFS_REF_METADATA, + BTRFS_REF_LAST, +}; + +struct btrfs_data_ref { + /* For EXTENT_DATA_REF */ + + /* Original root this data extent belongs to */ + u64 owning_root; + + /* Inode which refers to this data extent */ + u64 ino; + + /* + * file_offset - extent_offset + * + * file_offset is the key.offset of the EXTENT_DATA key. 
+ * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. + */ + u64 offset; +}; + +struct btrfs_tree_ref { + /* + * Level of this tree block + * + * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. + */ + int level; + + /* + * Root which owns this tree block. + * + * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) + */ + u64 owning_root; + + /* For non-skinny metadata, no special member needed */ +}; + +struct btrfs_ref { + enum btrfs_ref_type type; + int action; + + /* + * Whether this extent should go through qgroup record. + * + * Normally false, but for certain cases like delayed subtree scan, + * setting this flag can hugely reduce qgroup overhead. + */ + bool skip_qgroup; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* Through which root is this modification. */ + u64 real_root; +#endif + u64 bytenr; + u64 len; + + /* Bytenr of the parent tree block */ + u64 parent; + union { + struct btrfs_data_ref data_ref; + struct btrfs_tree_ref tree_ref; + }; +}; + +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int __init btrfs_delayed_ref_init(void); +void __cold btrfs_delayed_ref_exit(void); + +static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, + int num_delayed_refs) +{ + u64 num_bytes; + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs); + + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. 
+ */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + + return num_bytes; +} + +static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, + int action, u64 bytenr, u64 len, u64 parent) +{ + generic_ref->action = action; + generic_ref->bytenr = bytenr; + generic_ref->len = len; + generic_ref->parent = parent; +} + +static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, + int level, u64 root, u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: root; +#endif + generic_ref->tree_ref.level = level; + generic_ref->tree_ref.owning_root = root; + generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + +} + +static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, + u64 ref_root, u64 ino, u64 offset, u64 mod_root, + bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.owning_root = ref_root; + generic_ref->data_ref.ino = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; +} + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(refcount_read(&ref->refs) == 0); + if (refcount_dec_and_test(&ref->refs)) { + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + default: + BUG(); + } + } +} + +static inline u64 btrfs_ref_head_to_space_flags( + struct btrfs_delayed_ref_head *head_ref) +{ + if (head_ref->is_data) + return BTRFS_BLOCK_GROUP_DATA; + else if (head_ref->is_system) + return BTRFS_BLOCK_GROUP_SYSTEM; + return BTRFS_BLOCK_GROUP_METADATA; +} + +static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) +{ + if (refcount_dec_and_test(&head->refs)) + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); +} + +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + u64 reserved); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr); +int btrfs_delayed_ref_lock(struct 
btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); + +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs); + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); + +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + u64 num_bytes); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +{ + return container_of(node, struct btrfs_delayed_tree_ref, node); +} + +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + return container_of(node, struct btrfs_delayed_data_ref, node); +} + +#endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 0000000000..fe6ba17a05 --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,1291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "sysfs.h" +#include "zoned.h" +#include "block-group.h" +#include "fs.h" +#include "accessors.h" +#include "scrub.h" + +/* + * Device replace overview + * + * [Objective] + * To copy all extents (both new and on-disk) from source device to target + * device, while still keeping the filesystem read-write. + * + * [Method] + * There are two main methods involved: + * + * - Write duplication + * + * All new writes will be written to both target and source devices, so even + * if replace gets canceled, sources device still contains up-to-date data. + * + * Location: handle_ops_on_dev_replace() from btrfs_map_block() + * Start: btrfs_dev_replace_start() + * End: btrfs_dev_replace_finishing() + * Content: Latest data/metadata + * + * - Copy existing extents + * + * This happens by re-using scrub facility, as scrub also iterates through + * existing extents from commit root. + * + * Location: scrub_write_block_to_dev_replace() from + * scrub_block_complete() + * Content: Data/meta from commit root. + * + * Due to the content difference, we need to avoid nocow write when dev-replace + * is happening. This is done by marking the block group read-only and waiting + * for NOCOW writes. + * + * After replace is done, the finishing part is done by swapping the target and + * source devices. 
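+ *
+ * The swap keeps the original devid on the new device (the source device
+ * is moved to BTRFS_DEV_REPLACE_DEVID before it is removed), so the
+ * on-disk chunk tree keeps referring to the same devid.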
+ * + * Location: btrfs_dev_replace_update_device_in_mapping_tree() from + * btrfs_dev_replace_finishing() + */ + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static int btrfs_dev_replace_kthread(void *data); + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + if (!dev_root) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + /* + * We don't have a replace item or it's corrupted. If there is + * a replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, &args)) { + btrfs_err(fs_info, + "found replace target device without a valid replace item"); + ret = -EUCLEAN; + goto out; + } + ret = 0; + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + /* + * We don't have an active replace item but if there is a + * replace target, fail the mount. 
+ */ + if (btrfs_find_device(fs_info->fs_devices, &args)) { + btrfs_err(fs_info, +"replace without active item, run 'device scan --forget' on the target device"); + ret = -EUCLEAN; + } else { + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + } + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); + args.devid = src_devid; + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); + + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(fs_info, DEGRADED)) { + ret = -EIO; + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(fs_info, DEGRADED)) { + ret = -EIO; + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; + } + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &dev_replace->tgtdev->dev_state); + + WARN_ON(fs_info->fs_devices->rw_devices == 0); + dev_replace->tgtdev->io_width = fs_info->sectorsize; + dev_replace->tgtdev->io_align = fs_info->sectorsize; + dev_replace->tgtdev->sector_size = fs_info->sectorsize; + dev_replace->tgtdev->fs_info = fs_info; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &dev_replace->tgtdev->dev_state); + } + break; + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Initialize a new device for device replace target from a given source dev + * and path. 
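+ *
+ * The target is opened for exclusive write access, must be at least as
+ * large as the source device, and on zoned filesystems its zone type has
+ * to match the filesystem (see the checks below).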
+ * + * Return 0 and new device in @device_out, otherwise return < 0 + */ +static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + const char *device_path, + struct btrfs_device *srcdev, + struct btrfs_device **device_out) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + struct block_device *bdev; + u64 devid = BTRFS_DEV_REPLACE_DEVID; + int ret = 0; + + *device_out = NULL; + if (srcdev->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); + return -EINVAL; + } + + bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); + return PTR_ERR(bdev); + } + + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + btrfs_err(fs_info, + "dev-replace: zoned type of target device mismatch with filesystem"); + ret = -EINVAL; + goto error; + } + + sync_blockdev(bdev); + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->bdev == bdev) { + btrfs_err(fs_info, + "target device is in the filesystem!"); + ret = -EEXIST; + goto error; + } + } + + + if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { + btrfs_err(fs_info, + "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + + device = btrfs_alloc_device(NULL, &devid, NULL, device_path); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error; + + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + device->generation = 0; + device->io_width = fs_info->sectorsize; + device->io_align = fs_info->sectorsize; + device->sector_size = fs_info->sectorsize; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); + device->commit_total_bytes = srcdev->commit_total_bytes; + device->commit_bytes_used = device->bytes_used; + device->fs_info = fs_info; + device->bdev = bdev; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + device->holder = fs_info->bdev_holder; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + device->fs_devices = fs_devices; + + ret = btrfs_get_dev_zone_info(device, false); + if (ret) + goto error; + + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + fs_devices->open_devices++; + mutex_unlock(&fs_devices->device_list_mutex); + + *device_out = device; + return 0; + +error: + blkdev_put(bdev, fs_info->bdev_holder); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. 
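+ *
+ * Returns 0 if there is nothing to write back or the item was updated
+ * successfully, otherwise a negative errno.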
+ */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + down_read(&dev_replace->rwsem); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + up_read(&dev_replace->rwsem); + return 0; + } + up_read(&dev_replace->rwsem); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn(fs_info, + "error %d while searching for dev_replace item!", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. + * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. + */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + btrfs_warn(fs_info, + "delete too small dev_replace item failed %d!", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + btrfs_warn(fs_info, + "insert dev_replace item failed %d!", ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + down_write(&dev_replace->rwsem); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + up_write(&dev_replace->rwsem); + + btrfs_mark_buffer_dirty(trans, eb); + +out: + btrfs_free_path(path); + + return ret; +} + +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, + struct btrfs_device *src_dev) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_block_group *cache; + struct btrfs_trans_handle *trans; + int iter_ret = 0; + int ret = 0; + u64 chunk_offset; + + 
/* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&fs_info->chunk_mutex); + + /* Ensure we don't have pending new block group */ + spin_lock(&fs_info->trans_lock); + while (fs_info->running_transaction && + !list_empty(&fs_info->running_transaction->dev_update_list)) { + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->chunk_mutex); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret == -ENOENT) { + spin_lock(&fs_info->trans_lock); + continue; + } else { + goto unlock; + } + } + + ret = btrfs_commit_transaction(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret) + goto unlock; + + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto unlock; + } + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = src_dev->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *leaf = path->nodes[0]; + + if (found_key.objectid != src_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) + continue; + + set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); + btrfs_put_block_group(cache); + } + if (iter_ret < 0) + ret = iter_ret; + + btrfs_free_path(path); +unlock: + mutex_unlock(&fs_info->chunk_mutex); + + return ret; +} + +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map *em; + struct map_lookup *map; + u64 chunk_offset = cache->start; + int num_extents, cur_extent; + int i; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return true; + + spin_lock(&cache->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { + spin_unlock(&cache->lock); + return true; + } + spin_unlock(&cache->lock); + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(em)); + map = em->map_lookup; + + num_extents = 0; + cur_extent = 0; + for (i = 0; i < map->num_stripes; i++) { + /* We have more device extent to copy */ + if (srcdev != map->stripes[i].dev) + continue; + + num_extents++; + if (physical == map->stripes[i].physical) + cur_extent = i; + } + + free_extent_map(em); + + if (num_extents > 1 && cur_extent < num_extents - 1) { + /* + * Has more stripes on this device. Keep this block group + * readonly until we finish all the stripes. 
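+ *
+ * Only once the stripe being finished is the last one of this device in
+ * the chunk do we clear the TO_COPY flag below and report the block
+ * group as fully copied.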
+ */ + return false; + } + + /* Last stripe on this device */ + clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); + + return true; +} + +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, + const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, + int read_src) +{ + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, + srcdev_name); + if (IS_ERR(src_device)) + return PTR_ERR(src_device); + + if (btrfs_pinned_by_swapfile(fs_info, src_device)) { + btrfs_warn_in_rcu(fs_info, + "cannot replace device %s (devid %llu) due to active swapfile", + btrfs_dev_name(src_device), src_device->devid); + return -ETXTBSY; + } + + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, + src_device, &tgt_device); + if (ret) + return ret; + + ret = mark_block_group_to_copy(fs_info, src_device); + if (ret) + return ret; + + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + ASSERT(0); + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + up_write(&dev_replace->rwsem); + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = read_src; + dev_replace->srcdev = src_device; + dev_replace->tgtdev = tgt_device; + + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s started", + btrfs_dev_name(src_device), + src_device->devid, + btrfs_dev_name(tgt_device)); + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = ktime_get_real_seconds(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + up_write(&dev_replace->rwsem); + + ret = btrfs_sysfs_add_device(tgt_device); + if (ret) + btrfs_err(fs_info, "kobj add dev failed %d", ret); + + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. 
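+ *
+ * If starting the transaction below fails, the replace state is rolled
+ * back to NEVER_STARTED and the target device is torn down again.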
+ */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + up_write(&dev_replace->rwsem); + goto leave; + } + + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + btrfs_device_get_total_bytes(src_device), + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(fs_info, ret); + if (ret == -EINPROGRESS) + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + + return ret; + +leave: + btrfs_destroy_dev_replace_tgtdev(tgt_device); + return ret; +} + +int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + int ret; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, + args->start.srcdevid, + args->start.srcdev_name, + args->start.cont_reading_from_srcdev_mode); + args->result = ret; + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; + + return ret; +} + +/* + * blocked until all in-flight bios operations are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum( + &fs_info->dev_replace.bio_counter)); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + wake_up(&fs_info->dev_replace.replace_wait); +} + +/* + * When finishing the device replace, before swapping the source device with the + * target device we must update the chunk allocation state in the target device, + * as it is empty because replace works by directly copying the chunks and not + * through the normal chunk allocation path. 
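+ *
+ * Concretely, every range marked CHUNK_ALLOCATED in srcdev->alloc_state
+ * is mirrored into tgtdev->alloc_state (see the loop below), so later
+ * chunk allocations on the new device do not overlap the copied chunks.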
+ */ +static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 found_start; + u64 found_end; + int ret = 0; + + lockdep_assert_held(&srcdev->fs_info->chunk_mutex); + + while (find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); + if (ret) + break; + start = found_end + 1; + } + + free_extent_state(cached_state); + return ret; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + down_read(&dev_replace->rwsem); + /* was the operation canceled, or is it finished? */ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + up_read(&dev_replace->rwsem); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + up_read(&dev_replace->rwsem); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); + if (ret) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return ret; + } + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + + /* + * We have to use this loop approach because at this point src_device + * has to be available for transaction commit to complete, yet new + * chunks shouldn't be allocated on the device. + */ + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); + + /* Prevent write_all_supers() during the finishing procedure */ + mutex_lock(&fs_devices->device_list_mutex); + /* Prevent new chunks being allocated on the source device */ + mutex_lock(&fs_info->chunk_mutex); + + if (!list_empty(&src_device->post_commit_list)) { + mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + break; + } + } + + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + scrub_ret ? 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; + + /* + * Update allocation state in the new device and replace the old device + * with the new one in the mapping tree. + */ + if (!scrub_ret) { + scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); + if (scrub_ret) + goto error; + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", + btrfs_dev_name(src_device), + src_device->devid, + btrfs_dev_name(tgt_device), scrub_ret); +error: + up_write(&dev_replace->rwsem); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_rm_dev_replace_blocked(fs_info); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_rm_dev_replace_unblocked(fs_info); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return scrub_ret; + } + + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s finished", + btrfs_dev_name(src_device), + src_device->devid, + btrfs_dev_name(tgt_device)); + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); + btrfs_device_set_disk_total_bytes(tgt_device, + src_device->disk_total_bytes); + btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); + tgt_device->commit_bytes_used = src_device->bytes_used; + + btrfs_assign_next_active_device(src_device, tgt_device); + + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->rw_devices++; + + up_write(&dev_replace->rwsem); + btrfs_rm_dev_replace_blocked(fs_info); + + btrfs_rm_dev_replace_remove_srcdev(src_device); + + btrfs_rm_dev_replace_unblocked(fs_info); + + /* + * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will + * update on-disk dev stats value during commit transaction + */ + atomic_inc(&tgt_device->dev_stats_ccnt); + + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. + */ + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_devices->device_list_mutex); + + /* replace the sysfs entry */ + btrfs_sysfs_remove_device(src_device); + btrfs_sysfs_update_devid(tgt_device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) + btrfs_scratch_superblocks(fs_info, src_device->bdev, + src_device->name->str); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_rm_dev_replace_free_srcdev(src_device); + + return 0; +} + +/* + * Read progress of device replace status according to the state and last + * stored position. 
The value format is the same as for + * btrfs_dev_replace::progress_1000 + */ +static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 ret = 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + ret = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + ret = div64_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes( + dev_replace->srcdev), 1000)); + break; + } + + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + down_read(&dev_replace->rwsem); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); + up_read(&dev_replace->rwsem); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + int result; + int ret; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + up_write(&dev_replace->rwsem); + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + up_write(&dev_replace->rwsem); + ret = btrfs_scrub_cancel(fs_info); + if (ret < 0) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* + * btrfs_dev_replace_finishing() will handle the + * cleanup part + */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; + + up_write(&dev_replace->rwsem); + + /* Scrub for replace must not be running in suspended state */ + btrfs_scrub_cancel(fs_info); + + trans = 
btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); + + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + up_write(&dev_replace->rwsem); + result = -EINVAL; + } + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + down_write(&dev_replace->rwsem); + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; + btrfs_info(fs_info, "suspending dev_replace for unmount"); + break; + } + + up_write(&dev_replace->rwsem); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + down_write(&dev_replace->rwsem); + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + up_write(&dev_replace->rwsem); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + btrfs_info(fs_info, + "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); + return 0; + } + up_write(&dev_replace->rwsem); + + /* + * This could collide with a paused balance, but the exclusive op logic + * should never allow both to start and pause. We don't want to allow + * dev-replace to start anyway. 
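+ *
+ * If the exclusive operation cannot be taken, the replace is simply left
+ * in the SUSPENDED state again and a message is logged; nothing else is
+ * torn down.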
+ */ + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); + btrfs_info(fs_info, + "cannot resume dev-replace, other exclusive operation running"); + return 0; + } + + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_ERR_OR_ZERO(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 progress; + int ret; + + progress = btrfs_dev_replace_progress(fs_info); + progress = div_u64(progress, 10); + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to target %s @%u%%", + btrfs_dev_name(dev_replace->srcdev), + dev_replace->srcdev->devid, + btrfs_dev_name(dev_replace->tgtdev), + (unsigned int)progress); + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + btrfs_device_get_total_bytes(dev_replace->srcdev), + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret && ret != -ECANCELED); + + btrfs_exclop_finish(fs_info); + return 0; +} + +int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancellation is wanted. + */ + break; + } + return 1; +} + +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) +{ + percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); + cond_wake_up_nomb(&fs_info->dev_replace.replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + while (1) { + percpu_counter_inc(&fs_info->dev_replace.bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->dev_replace.replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + } +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 0000000000..675082ccec --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. 
+ */ + +#ifndef BTRFS_DEV_REPLACE_H +#define BTRFS_DEV_REPLACE_H + +struct btrfs_ioctl_dev_replace_args; +struct btrfs_fs_info; +struct btrfs_trans_handle; +struct btrfs_dev_replace; +struct btrfs_block_group; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); +int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical); +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + + +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 0000000000..9c07d5c3e5 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include "messages.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "accessors.h" +#include "dir-item.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. 
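+ *
+ * A caller is therefore expected to do, roughly (see the two callers in
+ * this file for the real thing):
+ *
+ *   di = insert_with_overflow(trans, root, path, &key, data_size,
+ *                             name, name_len);
+ *   if (IS_ERR(di))
+ *           return PTR_ERR(di);
+ *   ... fill in the dir item fields ...
+ *   write_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);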
+ */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + char *ptr; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + btrfs_extend_item(trans, path, data_size); + } else if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); + ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) +{ + int ret = 0; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) + return -ENOSPC; + + key.objectid = objectid; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = btrfs_name_hash(name, name_len); + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). 
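+ *
+ * Two entries are created: a DIR_ITEM keyed on the hash of the name and,
+ * except for the tree root, a DIR_INDEX keyed on 'index' which goes
+ * through the delayed inode code (btrfs_insert_delayed_dir_index()).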
+ * Will return 0 or -ENOMEM + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_root *root = dir->root; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = btrfs_ino(dir); + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(name->name, name->len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + btrfs_cpu_key_to_disk(&disk_key, location); + + data_size = sizeof(*dir_item) + name->len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name->name, name->len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out_free; + } + + if (IS_ENCRYPTED(&dir->vfs_inode)) + type |= BTRFS_FT_ENCRYPTED; + + leaf = path->nodes[0]; + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_flags(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name->len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name->name, name_ptr, name->len); + btrfs_mark_buffer_dirty(trans, leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out_free; + } + btrfs_release_path(path); + + ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, + &disk_key, type, index); +out_free: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +static struct btrfs_dir_item *btrfs_lookup_match_dir( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, const char *name, + int name_len, int mod) +{ + const int ins_len = (mod < 0 ? -1 : 0); + const int cow = (mod != 0); + int ret; + + ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + + return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); +} + +/* + * Lookup for a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir item does not exists, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. 
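+ *
+ * Callers therefore have to distinguish three outcomes, roughly:
+ *
+ *   di = btrfs_lookup_dir_item(trans, root, path, dir, name, 0);
+ *   if (IS_ERR(di))
+ *           return PTR_ERR(di);
+ *   if (!di)
+ *           ... the name does not exist in this directory ...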
+ */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, + int mod) +{ + struct btrfs_key key; + struct btrfs_dir_item *di; + + key.objectid = dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(name->name, name->len); + + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) + return NULL; + + return di; +} + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(name->name, name->len); + + di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, + name->len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + /* Nothing found, we're safe */ + if (ret == -ENOENT) { + ret = 0; + goto out; + } + + if (ret < 0) + goto out; + } + + /* we found an item, look for our name in the item */ + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } + + /* See if there is room in the item to insert this name. */ + data_size = sizeof(*di) + name->len; + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * Lookup for a directory index item by name and index number. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir index item does not exists, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). 
+ */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod) +{ + struct btrfs_dir_item *di; + struct btrfs_key key; + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = index; + + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; +} + +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dirid, const struct fscrypt_str *name) +{ + struct btrfs_dir_item *di; + struct btrfs_key key; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + btrfs_for_each_slot(root, &key, &key, path, ret) { + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root->fs_info, path, + name->name, name->len); + if (di) + return di; + } + /* Adjust return code if the key was not found in the next leaf. */ + if (ret > 0) + ret = 0; + + return ERR_PTR(ret); +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + struct btrfs_key key; + struct btrfs_dir_item *di; + + key.objectid = dir; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = btrfs_name_hash(name, name_len); + + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) + return NULL; + + return di; +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + + total_len = btrfs_item_size(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. 
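+ *
+ * If 'di' is the only name stored in the item the whole item is deleted,
+ * otherwise the name is shifted out with memmove_extent_buffer() and the
+ * item is truncated in place.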
+ */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); + } + return ret; +} diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h new file mode 100644 index 0000000000..aab4b7cc7f --- /dev/null +++ b/fs/btrfs/dir-item.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIR_ITEM_H +#define BTRFS_DIR_ITEM_H + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_lookup_dir_index_item( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const struct fscrypt_str *name); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, + int name_len); + +#endif diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c new file mode 100644 index 0000000000..944a7340f6 --- /dev/null +++ b/fs/btrfs/discard.c @@ -0,0 +1,777 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "block-group.h" +#include "discard.h" +#include "free-space-cache.h" +#include "fs.h" + +/* + * This contains the logic to handle async discard. + * + * Async discard manages trimming of free space outside of transaction commit. + * Discarding is done by managing the block_groups on a LRU list based on free + * space recency. Two passes are used to first prioritize discarding extents + * and then allow for trimming in the bitmap the best opportunity to coalesce. + * The block_groups are maintained on multiple lists to allow for multiple + * passes with different discard filter requirements. 
A delayed work item is + * used to manage discarding with timeout determined by a max of the delay + * incurred by the iops rate limit, the byte rate limit, and the max delay of + * BTRFS_DISCARD_MAX_DELAY. + * + * Note, this only keeps track of block_groups that are explicitly for data. + * Mixed block_groups are not supported. + * + * The first list is special to manage discarding of fully free block groups. + * This is necessary because we issue a final trim for a full free block group + * after forgetting it. When a block group becomes unused, instead of directly + * being added to the unused_bgs list, we add it to this first list. Then + * from there, if it becomes fully discarded, we place it onto the unused_bgs + * list. + * + * The in-memory free space cache serves as the backing state for discard. + * Consequently this means there is no persistence. We opt to load all the + * block groups in as not discarded, so the mount case degenerates to the + * crashing case. + * + * As the free space cache uses bitmaps, there exists a tradeoff between + * ease/efficiency for find_free_extent() and the accuracy of discard state. + * Here we opt to let untrimmed regions merge with everything while only letting + * trimmed regions merge with other trimmed regions. This can cause + * overtrimming, but the coalescing benefit seems to be worth it. Additionally, + * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, + * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, + * this resets the state and we will retry trimming the whole bitmap. This is a + * tradeoff between discard state accuracy and the cost of accounting. + */ + +/* This is an initial delay to give some chance for block reuse */ +#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) +#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) + +#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) +#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) +#define BTRFS_DISCARD_MAX_IOPS (1000U) + +/* Monotonically decreasing minimum length filters after index 0 */ +static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { + 0, + BTRFS_ASYNC_DISCARD_MAX_FILTER, + BTRFS_ASYNC_DISCARD_MIN_FILTER +}; + +static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + return &discard_ctl->discard_list[block_group->discard_index]; +} + +/* + * Determine if async discard should be running. + * + * @discard_ctl: discard control + * + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
+ */ +static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + +static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + lockdep_assert_held(&discard_ctl->lock); + if (!btrfs_run_discard_work(discard_ctl)) + return; + + if (list_empty(&block_group->discard_list) || + block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) + block_group->discard_index = BTRFS_DISCARD_INDEX_START; + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); + + list_move_tail(&block_group->discard_list, + get_discard_list(discard_ctl, block_group)); +} + +static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!btrfs_is_block_group_data_only(block_group)) + return; + + spin_lock(&discard_ctl->lock); + __add_to_discard_list(discard_ctl, block_group); + spin_unlock(&discard_ctl->lock); +} + +static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + bool queued; + + spin_lock(&discard_ctl->lock); + + queued = !list_empty(&block_group->discard_list); + + if (!btrfs_run_discard_work(discard_ctl)) { + spin_unlock(&discard_ctl->lock); + return; + } + + list_del_init(&block_group->discard_list); + + block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_UNUSED_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); + list_add_tail(&block_group->discard_list, + &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); + + spin_unlock(&discard_ctl->lock); +} + +static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + bool running = false; + bool queued = false; + + spin_lock(&discard_ctl->lock); + + if (block_group == discard_ctl->block_group) { + running = true; + discard_ctl->block_group = NULL; + } + + block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); + list_del_init(&block_group->discard_list); + /* + * If the block group is currently running in the discard workfn, we + * don't want to deref it, since it's still being used by the workfn. + * The workfn will notice this case and deref the block group when it is + * finished. + */ + if (queued && !running) + btrfs_put_block_group(block_group); + + spin_unlock(&discard_ctl->lock); + + return running; +} + +/* + * Find block_group that's up next for discarding. + * + * @discard_ctl: discard control + * @now: current time + * + * Iterate over the discard lists to find the next block_group up for + * discarding checking the discard_eligible_time of block_group. 
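Illustration only, not from the btrfs sources: the comment just above (and the selection helper defined right after it) describes the rule for picking the next block group, namely that the head of each priority list is a candidate, an already-eligible head on a higher-priority list wins immediately, and otherwise the candidate with the earliest eligible time is kept. The user-space sketch below applies that rule to plain arrays; the names candidate and pick_next are invented for this sketch.

#include <stddef.h>
#include <stdint.h>

struct candidate {
	uint64_t eligible_time;		/* ns timestamp when discard may start */
};

/*
 * heads[i] is the first entry of priority list i, or NULL if that list is
 * empty.  Lists are ordered from highest to lowest priority.
 */
static struct candidate *pick_next(struct candidate **heads, int nr_lists,
				   uint64_t now)
{
	struct candidate *best = NULL;

	for (int i = 0; i < nr_lists; i++) {
		struct candidate *head = heads[i];

		if (!head)
			continue;
		if (!best)
			best = head;
		/* A higher-priority head that is already eligible wins. */
		if (best->eligible_time < now)
			break;
		/* Otherwise remember whichever becomes eligible soonest. */
		if (best->eligible_time > head->eligible_time)
			best = head;
	}
	return best;
}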
+ */ +static struct btrfs_block_group *find_next_block_group( + struct btrfs_discard_ctl *discard_ctl, + u64 now) +{ + struct btrfs_block_group *ret_block_group = NULL, *block_group; + int i; + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + struct list_head *discard_list = &discard_ctl->discard_list[i]; + + if (!list_empty(discard_list)) { + block_group = list_first_entry(discard_list, + struct btrfs_block_group, + discard_list); + + if (!ret_block_group) + ret_block_group = block_group; + + if (ret_block_group->discard_eligible_time < now) + break; + + if (ret_block_group->discard_eligible_time > + block_group->discard_eligible_time) + ret_block_group = block_group; + } + } + + return ret_block_group; +} + +/* + * Look up next block group and set it for use. + * + * @discard_ctl: discard control + * @discard_state: the discard_state of the block_group after state management + * @discard_index: the discard_index of the block_group after state management + * @now: time when discard was invoked, in ns + * + * Wrap find_next_block_group() and set the block_group to be in use. + * @discard_state's control flow is managed here. Variables related to + * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state + * and @discard_index are remembered as it may change while we're discarding, + * but we want the discard to execute in the context determined here. + */ +static struct btrfs_block_group *peek_discard_list( + struct btrfs_discard_ctl *discard_ctl, + enum btrfs_discard_state *discard_state, + int *discard_index, u64 now) +{ + struct btrfs_block_group *block_group; + + spin_lock(&discard_ctl->lock); +again: + block_group = find_next_block_group(discard_ctl, now); + + if (block_group && now >= block_group->discard_eligible_time) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && + block_group->used != 0) { + if (btrfs_is_block_group_data_only(block_group)) { + __add_to_discard_list(discard_ctl, block_group); + } else { + list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } + goto again; + } + if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { + block_group->discard_cursor = block_group->start; + block_group->discard_state = BTRFS_DISCARD_EXTENTS; + } + discard_ctl->block_group = block_group; + } + if (block_group) { + *discard_state = block_group->discard_state; + *discard_index = block_group->discard_index; + } + spin_unlock(&discard_ctl->lock); + + return block_group; +} + +/* + * Update a block group's filters. + * + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing + * + * Async discard maintains multiple lists with progressively smaller filters + * to prioritize discarding based on size. Should a free space that matches + * a larger filter be returned to the free_space_cache, prioritize that discard + * by moving @block_group to the proper filter. 
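Illustration only, not from the btrfs sources: the doc comment above says a block group is promoted to a stricter list when a freed region is large enough for a larger size filter, which in the code below amounts to picking the first list whose minimum length the region meets. The user-space sketch that follows shows just that index choice; NR_LISTS, LIST_START, min_len and pick_list_index are invented stand-ins mirroring discard_minlen declared earlier in this file and the 1 MiB / 32 KiB filter limits from discard.h.

#include <stdint.h>

#define NR_LISTS	3
#define LIST_START	1		/* list 0 is reserved for unused block groups */

/* Monotonically decreasing minimum lengths, one per list after index 0. */
static const uint64_t min_len[NR_LISTS] = {
	0,
	1024 * 1024,			/* 1 MiB  */
	32 * 1024,			/* 32 KiB */
};

/* Return the list index a freed region of @bytes qualifies for. */
static int pick_list_index(uint64_t bytes)
{
	for (int i = LIST_START; i < NR_LISTS; i++) {
		if (bytes >= min_len[i])
			return i;
	}
	/* Smaller than every filter: fall back to the smallest-filter list. */
	return NR_LISTS - 1;
}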
+ */ +void btrfs_discard_check_filter(struct btrfs_block_group *block_group, + u64 bytes) +{ + struct btrfs_discard_ctl *discard_ctl; + + if (!block_group || + !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + discard_ctl = &block_group->fs_info->discard_ctl; + + if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && + bytes >= discard_minlen[block_group->discard_index - 1]) { + int i; + + remove_from_discard_list(discard_ctl, block_group); + + for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; + i++) { + if (bytes >= discard_minlen[i]) { + block_group->discard_index = i; + add_to_discard_list(discard_ctl, block_group); + break; + } + } + } +} + +/* + * Move a block group along the discard lists. + * + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * Increment @block_group's discard_index. If it falls of the list, let it be. + * Otherwise add it back to the appropriate list. + */ +static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + block_group->discard_index++; + if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { + block_group->discard_index = 1; + return; + } + + add_to_discard_list(discard_ctl, block_group); +} + +/* + * Remove a block_group from the discard lists. + * + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * Remove @block_group from the discard lists. If necessary, wait on the + * current work and then reschedule the delayed work. + */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (remove_from_discard_list(discard_ctl, block_group)) { + cancel_delayed_work_sync(&discard_ctl->work); + btrfs_discard_schedule_work(discard_ctl, true); + } +} + +/* + * Handles queuing the block_groups. + * + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * Maintain the LRU order of the discard lists. + */ +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + if (block_group->used == 0) + add_to_discard_unused_list(discard_ctl, block_group); + else + add_to_discard_list(discard_ctl, block_group); + + if (!delayed_work_pending(&discard_ctl->work)) + btrfs_discard_schedule_work(discard_ctl, false); +} + +static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + u64 now, bool override) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_run_discard_work(discard_ctl)) + return; + if (!override && delayed_work_pending(&discard_ctl->work)) + return; + + block_group = find_next_block_group(discard_ctl, now); + if (block_group) { + u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; + u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); + + /* + * A single delayed workqueue item is responsible for + * discarding, so we can manage the bytes rate limit by keeping + * track of the previous discard. + */ + if (kbps_limit && discard_ctl->prev_discard) { + u64 bps_limit = ((u64)kbps_limit) * SZ_1K; + u64 bps_delay = div64_u64(discard_ctl->prev_discard * + NSEC_PER_SEC, bps_limit); + + delay = max(delay, bps_delay); + } + + /* + * This timeout is to hopefully prevent immediate discarding + * in a recently allocated block group. 
+ */ + if (now < block_group->discard_eligible_time) { + u64 bg_timeout = block_group->discard_eligible_time - now; + + delay = max(delay, bg_timeout); + } + + if (override && discard_ctl->prev_discard) { + u64 elapsed = now - discard_ctl->prev_discard_time; + + if (delay > elapsed) + delay -= elapsed; + else + delay = 0; + } + + mod_delayed_work(discard_ctl->discard_workers, + &discard_ctl->work, nsecs_to_jiffies(delay)); + } +} + +/* + * Responsible for scheduling the discard work. + * + * @discard_ctl: discard control + * @override: override the current timer + * + * Discards are issued by a delayed workqueue item. @override is used to + * update the current delay as the baseline delay interval is reevaluated on + * transaction commit. This is also maxed with any other rate limit. + */ +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override) +{ + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); + __btrfs_discard_schedule_work(discard_ctl, now, override); + spin_unlock(&discard_ctl->lock); +} + +/* + * Determine next step of a block_group. + * + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * Determine the next step for a block group after it's finished going through + * a pass on a discard list. If it is unused and fully trimmed, we can mark it + * unused and send it to the unused_bgs path. Otherwise, pass it onto the + * appropriate filter list or let it fall off. + */ +static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + remove_from_discard_list(discard_ctl, block_group); + + if (block_group->used == 0) { + if (btrfs_is_free_space_trimmed(block_group)) + btrfs_mark_bg_unused(block_group); + else + add_to_discard_unused_list(discard_ctl, block_group); + } else { + btrfs_update_discard_index(discard_ctl, block_group); + } +} + +/* + * Discard work queue callback + * + * @work: work + * + * Find the next block_group to start discarding and then discard a single + * region. It does this in a two-pass fashion: first extents and second + * bitmaps. Completely discarded block groups are sent to the unused_bgs path. + */ +static void btrfs_discard_workfn(struct work_struct *work) +{ + struct btrfs_discard_ctl *discard_ctl; + struct btrfs_block_group *block_group; + enum btrfs_discard_state discard_state; + int discard_index = 0; + u64 trimmed = 0; + u64 minlen = 0; + u64 now = ktime_get_ns(); + + discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); + + block_group = peek_discard_list(discard_ctl, &discard_state, + &discard_index, now); + if (!block_group || !btrfs_run_discard_work(discard_ctl)) + return; + if (now < block_group->discard_eligible_time) { + btrfs_discard_schedule_work(discard_ctl, false); + return; + } + + /* Perform discarding */ + minlen = discard_minlen[discard_index]; + + if (discard_state == BTRFS_DISCARD_BITMAPS) { + u64 maxlen = 0; + + /* + * Use the previous levels minimum discard length as the max + * length filter. In the case something is added to make a + * region go beyond the max filter, the entire bitmap is set + * back to BTRFS_TRIM_STATE_UNTRIMMED. 
+ */ + if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) + maxlen = discard_minlen[discard_index - 1]; + + btrfs_trim_block_group_bitmaps(block_group, &trimmed, + block_group->discard_cursor, + btrfs_block_group_end(block_group), + minlen, maxlen, true); + discard_ctl->discard_bitmap_bytes += trimmed; + } else { + btrfs_trim_block_group_extents(block_group, &trimmed, + block_group->discard_cursor, + btrfs_block_group_end(block_group), + minlen, true); + discard_ctl->discard_extent_bytes += trimmed; + } + + /* Determine next steps for a block_group */ + if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { + if (discard_state == BTRFS_DISCARD_BITMAPS) { + btrfs_finish_discard_pass(discard_ctl, block_group); + } else { + block_group->discard_cursor = block_group->start; + spin_lock(&discard_ctl->lock); + if (block_group->discard_state != + BTRFS_DISCARD_RESET_CURSOR) + block_group->discard_state = + BTRFS_DISCARD_BITMAPS; + spin_unlock(&discard_ctl->lock); + } + } + + now = ktime_get_ns(); + spin_lock(&discard_ctl->lock); + discard_ctl->prev_discard = trimmed; + discard_ctl->prev_discard_time = now; + /* + * If the block group was removed from the discard list while it was + * running in this workfn, then we didn't deref it, since this function + * still owned that reference. But we set the discard_ctl->block_group + * back to NULL, so we can use that condition to know that now we need + * to deref the block_group. + */ + if (discard_ctl->block_group == NULL) + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + __btrfs_discard_schedule_work(discard_ctl, now, false); + spin_unlock(&discard_ctl->lock); +} + +/* + * Recalculate the base delay. + * + * @discard_ctl: discard control + * + * Recalculate the base delay which is based off the total number of + * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) + * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). + */ +void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) +{ + s32 discardable_extents; + s64 discardable_bytes; + u32 iops_limit; + unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; + unsigned long delay; + + discardable_extents = atomic_read(&discard_ctl->discardable_extents); + if (!discardable_extents) + return; + + spin_lock(&discard_ctl->lock); + + /* + * The following is to fix a potential -1 discrepancy that we're not + * sure how to reproduce. But given that this is the only place that + * utilizes these numbers and this is only called by from + * btrfs_finish_extent_commit() which is synchronized, we can correct + * here. + */ + if (discardable_extents < 0) + atomic_add(-discardable_extents, + &discard_ctl->discardable_extents); + + discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); + if (discardable_bytes < 0) + atomic64_add(-discardable_bytes, + &discard_ctl->discardable_bytes); + + if (discardable_extents <= 0) { + spin_unlock(&discard_ctl->lock); + return; + } + + iops_limit = READ_ONCE(discard_ctl->iops_limit); + + if (iops_limit) { + delay = MSEC_PER_SEC / iops_limit; + } else { + /* + * Unset iops_limit means go as fast as possible, so allow a + * delay of 0. + */ + delay = 0; + min_delay = 0; + } + + delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); + discard_ctl->delay_ms = delay; + + spin_unlock(&discard_ctl->lock); +} + +/* + * Propagate discard counters. + * + * @block_group: block_group of interest + * + * Propagate deltas of counters up to the discard_ctl. 
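Illustration only, not from the btrfs sources: two rate limits appear above, where btrfs_discard_calc_delay() derives a per-request delay from the iops limit and __btrfs_discard_schedule_work() stretches that delay so the previously discarded byte count respects the kbps limit and the next block group's eligible time. The user-space sketch below simply redoes that arithmetic; all names in it are invented and the clamping bounds stand in for the MIN/MAX delay constants defined earlier in this file.

#include <stdint.h>

#define MSEC_PER_SEC	1000ULL
#define NSEC_PER_MSEC	1000000ULL
#define NSEC_PER_SEC	1000000000ULL

static inline uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/* Base delay in ms from the iops limit, clamped to [min_ms, max_ms]. */
static uint64_t base_delay_ms(uint32_t iops_limit, uint64_t min_ms, uint64_t max_ms)
{
	uint64_t delay;

	if (!iops_limit)
		return 0;			/* unlimited: go as fast as possible */
	delay = MSEC_PER_SEC / iops_limit;	/* e.g. 10 iops -> 100 ms between requests */
	if (delay < min_ms)
		delay = min_ms;
	if (delay > max_ms)
		delay = max_ms;
	return delay;
}

/* Next delay in ns, honouring the byte rate limit and the eligible time. */
static uint64_t next_delay_ns(uint64_t delay_ms, uint32_t kbps_limit,
			      uint64_t prev_discard_bytes,
			      uint64_t eligible_time_ns, uint64_t now_ns)
{
	uint64_t delay = delay_ms * NSEC_PER_MSEC;

	if (kbps_limit && prev_discard_bytes) {
		uint64_t bps = (uint64_t)kbps_limit * 1024;
		uint64_t bps_delay = prev_discard_bytes * NSEC_PER_SEC / bps;

		delay = max_u64(delay, bps_delay);
	}
	if (now_ns < eligible_time_ns)
		delay = max_u64(delay, eligible_time_ns - now_ns);
	return delay;
}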
It maintains a current + * counter and a previous counter passing the delta up to the global stat. + * Then the current counter value becomes the previous counter value. + */ +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) +{ + struct btrfs_free_space_ctl *ctl; + struct btrfs_discard_ctl *discard_ctl; + s32 extents_delta; + s64 bytes_delta; + + if (!block_group || + !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || + !btrfs_is_block_group_data_only(block_group)) + return; + + ctl = block_group->free_space_ctl; + discard_ctl = &block_group->fs_info->discard_ctl; + + lockdep_assert_held(&ctl->tree_lock); + extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - + ctl->discardable_extents[BTRFS_STAT_PREV]; + if (extents_delta) { + atomic_add(extents_delta, &discard_ctl->discardable_extents); + ctl->discardable_extents[BTRFS_STAT_PREV] = + ctl->discardable_extents[BTRFS_STAT_CURR]; + } + + bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - + ctl->discardable_bytes[BTRFS_STAT_PREV]; + if (bytes_delta) { + atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); + ctl->discardable_bytes[BTRFS_STAT_PREV] = + ctl->discardable_bytes[BTRFS_STAT_CURR]; + } +} + +/* + * Punt unused_bgs list to discard lists. + * + * @fs_info: fs_info of interest + * + * The unused_bgs list needs to be punted to the discard lists because the + * order of operations is changed. In the normal synchronous discard path, the + * block groups are trimmed via a single large trim in transaction commit. This + * is ultimately what we are trying to avoid with asynchronous discard. Thus, + * it must be done before going down the unused_bgs path. + */ +void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group, *next; + + spin_lock(&fs_info->unused_bgs_lock); + /* We enabled async discard, so punt all to the queue */ + list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, + bg_list) { + list_del_init(&block_group->bg_list); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. + */ + btrfs_put_block_group(block_group); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +/* + * Purge discard lists. + * + * @discard_ctl: discard control + * + * If we are disabling async discard, we may have intercepted block groups that + * are completely free and ready for the unused_bgs path. As discarding will + * now happen in transaction commit or not at all, we can safely mark the + * corresponding block groups as unused and they will be sent on their merry + * way to the unused_bgs list. 
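Illustration only, not from the btrfs sources: btrfs_discard_update_discardable() above pushes only the delta between a "current" and a "previous" per-block-group counter into the global counter, then makes current the new previous. The tiny user-space sketch below shows that delta pattern; stat_pair, propagate_delta and the global pointer are invented names.

#include <stdint.h>

struct stat_pair {
	int64_t curr;	/* value maintained by the local owner */
	int64_t prev;	/* last value that was folded into the global sum */
};

/* Fold the local change into @global and remember what was reported. */
static void propagate_delta(struct stat_pair *local, int64_t *global)
{
	int64_t delta = local->curr - local->prev;

	if (delta) {
		*global += delta;
		local->prev = local->curr;
	}
}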
+ */ +static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_block_group *block_group, *next; + int i; + + spin_lock(&discard_ctl->lock); + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + list_for_each_entry_safe(block_group, next, + &discard_ctl->discard_list[i], + discard_list) { + list_del_init(&block_group->discard_list); + spin_unlock(&discard_ctl->lock); + if (block_group->used == 0) + btrfs_mark_bg_unused(block_group); + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + } + } + spin_unlock(&discard_ctl->lock); +} + +void btrfs_discard_resume(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_cleanup(fs_info); + return; + } + + btrfs_discard_punt_unused_bgs_list(fs_info); + + set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_stop(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_init(struct btrfs_fs_info *fs_info) +{ + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + int i; + + spin_lock_init(&discard_ctl->lock); + INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) + INIT_LIST_HEAD(&discard_ctl->discard_list[i]); + + discard_ctl->prev_discard = 0; + discard_ctl->prev_discard_time = 0; + atomic_set(&discard_ctl->discardable_extents, 0); + atomic64_set(&discard_ctl->discardable_bytes, 0); + discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; + discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC; + discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; + discard_ctl->kbps_limit = 0; + discard_ctl->discard_extent_bytes = 0; + discard_ctl->discard_bitmap_bytes = 0; + atomic64_set(&discard_ctl->discard_bytes_saved, 0); +} + +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) +{ + btrfs_discard_stop(fs_info); + cancel_delayed_work_sync(&fs_info->discard_ctl.work); + btrfs_discard_purge_list(&fs_info->discard_ctl); +} diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h new file mode 100644 index 0000000000..dddb0f9101 --- /dev/null +++ b/fs/btrfs/discard.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DISCARD_H +#define BTRFS_DISCARD_H + +#include + +struct btrfs_fs_info; +struct btrfs_discard_ctl; +struct btrfs_block_group; + +/* Discard size limits */ +#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE (SZ_64M) +#define BTRFS_ASYNC_DISCARD_MAX_FILTER (SZ_1M) +#define BTRFS_ASYNC_DISCARD_MIN_FILTER (SZ_32K) + +/* List operations */ +void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes); + +/* Work operations */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override); + +/* Update operations */ +void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group); + +/* Setup/cleanup operations */ +void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info); +void btrfs_discard_resume(struct btrfs_fs_info *fs_info); +void btrfs_discard_stop(struct btrfs_fs_info *fs_info); +void btrfs_discard_init(struct btrfs_fs_info *fs_info); +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info); + +#endif diff --git 
a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 0000000000..b79781df70 --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,5005 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "bio.h" +#include "print-tree.h" +#include "locking.h" +#include "tree-log.h" +#include "free-space-cache.h" +#include "free-space-tree.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "raid56.h" +#include "sysfs.h" +#include "qgroup.h" +#include "compression.h" +#include "tree-checker.h" +#include "ref-verify.h" +#include "block-group.h" +#include "discard.h" +#include "space-info.h" +#include "zoned.h" +#include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "uuid-tree.h" +#include "relocation.h" +#include "scrub.h" +#include "super.h" + +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP |\ + BTRFS_SUPER_FLAG_METADUMP_V2) + +static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); +static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); + +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + if (fs_info->csum_shash) + crypto_free_shash(fs_info->csum_shash); +} + +/* + * Compute the csum of a btree block and store the result to provided buffer. + */ +static void csum_tree_block(struct extent_buffer *buf, u8 *result) +{ + struct btrfs_fs_info *fs_info = buf->fs_info; + const int num_pages = num_extent_pages(buf); + const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + int i; + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + first_page_part - BTRFS_CSUM_SIZE); + + for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { + kaddr = page_address(buf->pages[i]); + crypto_shash_update(shash, kaddr, PAGE_SIZE); + } + memset(result, 0, BTRFS_CSUM_SIZE); + crypto_shash_final(shash, result); +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. 
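Illustration only, not from the btrfs sources: csum_tree_block() above computes a digest over everything in the tree block except its leading checksum field, and the result is later stored into that field (see btree_csum_one_bio() further down). The user-space sketch below shows that layout with a trivial byte-sum stand-in in place of the real crypto hash; CSUM_SIZE, toy_digest and csum_block are invented for this sketch.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CSUM_SIZE 32	/* bytes reserved at the start of the block for the checksum */

/* Placeholder digest: a running byte sum, *not* a real checksum algorithm. */
static uint64_t toy_digest(const uint8_t *data, size_t len)
{
	uint64_t sum = 0;

	for (size_t i = 0; i < len; i++)
		sum += data[i];
	return sum;
}

/* Checksum block[CSUM_SIZE..len) and store the result at block[0..CSUM_SIZE). */
static void csum_block(uint8_t *block, size_t len)
{
	uint64_t sum = toy_digest(block + CSUM_SIZE, len - CSUM_SIZE);

	memset(block, 0, CSUM_SIZE);
	memcpy(block, &sum, sizeof(sum));
}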
+ */ +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) +{ + if (!extent_buffer_uptodate(eb)) + return 0; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 1; + + if (atomic) + return -EAGAIN; + + if (!extent_buffer_uptodate(eb) || + btrfs_header_generation(eb) != parent_transid) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, + parent_transid, btrfs_header_generation(eb)); + clear_extent_buffer_uptodate(eb); + return 0; + } + return 1; +} + +static bool btrfs_supported_super_csum(u16 csum_type) +{ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + case BTRFS_CSUM_TYPE_XXHASH: + case BTRFS_CSUM_TYPE_SHA256: + case BTRFS_CSUM_TYPE_BLAKE2: + return true; + default: + return false; + } +} + +/* + * Return 0 if the superblock checksum type matches the checksum value of that + * algorithm. Pass the raw disk superblock data. + */ +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb) +{ + char result[BTRFS_CSUM_SIZE]; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is + * filled with zeros and is included in the checksum. + */ + crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); + + if (memcmp(disk_sb->csum, result, fs_info->csum_size)) + return 1; + + return 0; +} + +static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int i, num_pages = num_extent_pages(eb); + int ret = 0; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + u64 start = max_t(u64, eb->start, page_offset(p)); + u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + u32 len = end - start; + + ret = btrfs_repair_io_failure(fs_info, 0, start, len, + start, p, offset_in_page(start), mirror_num); + if (ret) + break; + } + + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + * + * @check: expected tree parentness check, see the comments of the + * structure for details. + */ +int btrfs_read_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int failed = 0; + int ret; + int num_copies = 0; + int mirror_num = 0; + int failed_mirror = 0; + + ASSERT(check); + + while (1) { + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + if (!ret) + break; + + num_copies = btrfs_num_copies(fs_info, + eb->start, eb->len); + if (num_copies == 1) + break; + + if (!failed_mirror) { + failed = 1; + failed_mirror = eb->read_mirror; + } + + mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + + if (mirror_num > num_copies) + break; + } + + if (failed && !ret && failed_mirror) + btrfs_repair_eb_io_failure(eb, failed_mirror); + + return ret; +} + +/* + * Checksum a dirty tree block before IO. 
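Illustration only, not from the btrfs sources: the read path above retries a metadata read on each remaining mirror, skips the mirror that already failed, and repairs the bad copy once a good one is found. The user-space sketch below reproduces that retry loop in isolation; read_copy() and repair_copy() are hypothetical callbacks, not kernel functions, and mirror 0 stands for "let the lower layer pick".

static int read_with_retries(int num_copies,
			     int (*read_copy)(int mirror),
			     void (*repair_copy)(int mirror))
{
	int failed_mirror = 0;
	int mirror = 0;		/* 0: let the lower layer choose a mirror */
	int ret = 0;

	while (1) {
		ret = read_copy(mirror);
		if (!ret)
			break;
		if (num_copies == 1)
			break;			/* nothing else to try */
		if (!failed_mirror)
			failed_mirror = (mirror == 0) ? 1 : mirror;

		mirror++;
		if (mirror == failed_mirror)	/* don't retry the known-bad copy */
			mirror++;
		if (mirror > num_copies)
			break;
	}

	/* A later mirror succeeded after an earlier failure: rewrite the bad copy. */
	if (!ret && failed_mirror)
		repair_copy(failed_mirror);
	return ret;
}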
+ */ +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +{ + struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 found_start = btrfs_header_bytenr(eb); + u8 result[BTRFS_CSUM_SIZE]; + int ret; + + /* Btree blocks are always contiguous on disk. */ + if (WARN_ON_ONCE(bbio->file_offset != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) + return BLK_STS_IOERR; + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON_ONCE(found_start != 0); + return BLK_STS_OK; + } + + if (WARN_ON_ONCE(found_start != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, + eb->len))) + return BLK_STS_IOERR; + + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); + csum_tree_block(eb, result); + + if (btrfs_header_level(eb)) + ret = btrfs_check_node(eb); + else + ret = btrfs_check_leaf(eb); + + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. + */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; + } + write_extent_buffer(eb, result, 0, fs_info->csum_size); + return BLK_STS_OK; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + /* + * Be noisy if this is an extent buffer from a log tree. We don't abort + * a transaction in case there's a bad log tree extent buffer, we just + * fallback to a transaction commit. Still we want to know when there is + * a bad log tree extent buffer, as that may signal a bug somewhere. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || + btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); + return errno_to_blk_status(ret); +} + +static bool check_tree_block_fsid(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + u8 fsid[BTRFS_FSID_SIZE]; + + read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); + + /* + * alloc_fs_devices() copies the fsid into metadata_uuid if the + * metadata_uuid is unset in the superblock, including for a seed device. + * So, we can use fs_devices->metadata_uuid. 
+ */ + if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) + return false; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return false; + + return true; +} + +/* Do basic extent buffer checks at read time */ +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 found_start; + const u32 csum_size = fs_info->csum_size; + u8 found_level; + u8 result[BTRFS_CSUM_SIZE]; + const u8 *header_csum; + int ret = 0; + + ASSERT(check); + + found_start = btrfs_header_bytenr(eb); + if (found_start != eb->start) { + btrfs_err_rl(fs_info, + "bad tree block start, mirror %u want %llu have %llu", + eb->read_mirror, eb->start, found_start); + ret = -EIO; + goto out; + } + if (check_tree_block_fsid(eb)) { + btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", + eb->start, eb->read_mirror); + ret = -EIO; + goto out; + } + found_level = btrfs_header_level(eb); + if (found_level >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, + "bad tree block level, mirror %u level %d on logical %llu", + eb->read_mirror, btrfs_header_level(eb), eb->start); + ret = -EIO; + goto out; + } + + csum_tree_block(eb, result); + header_csum = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); + + if (memcmp(result, header_csum, csum_size) != 0) { + btrfs_warn_rl(fs_info, +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, eb->read_mirror, + CSUM_FMT_VALUE(csum_size, header_csum), + CSUM_FMT_VALUE(csum_size, result), + btrfs_header_level(eb)); + ret = -EUCLEAN; + goto out; + } + + if (found_level != check->level) { + btrfs_err(fs_info, + "level verify failed on logical %llu mirror %u wanted %u found %u", + eb->start, eb->read_mirror, check->level, found_level); + ret = -EIO; + goto out; + } + if (unlikely(check->transid && + btrfs_header_generation(eb) != check->transid)) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, check->transid, + btrfs_header_generation(eb)); + ret = -EIO; + goto out; + } + if (check->has_first_key) { + struct btrfs_key *expect_key = &check->first_key; + struct btrfs_key found_key; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, check->transid, + expect_key->objectid, + expect_key->type, expect_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + ret = -EUCLEAN; + goto out; + } + } + if (check->owner_root) { + ret = btrfs_check_eb_owner(eb, check->owner_root); + if (ret < 0) + goto out; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. 
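Illustration only, not from the btrfs sources: the validation above is a fixed early-return pipeline, checking location, fsid, level, checksum, expected generation, first key and owner in turn, each failure logged and mapped to an error code. The compressed user-space sketch below shows that style on a made-up header; struct hdr, struct expect and validate_header are invented stand-ins, not the on-disk btrfs structures.

#include <errno.h>
#include <stdint.h>
#include <string.h>

struct hdr    { uint64_t bytenr; uint64_t generation; uint8_t level; uint8_t fsid[16]; };
struct expect { uint64_t bytenr; uint64_t transid;    uint8_t level; uint8_t fsid[16]; };

static int validate_header(const struct hdr *h, const struct expect *e)
{
	if (h->bytenr != e->bytenr)
		return -EIO;		/* block landed at the wrong location */
	if (memcmp(h->fsid, e->fsid, sizeof(h->fsid)) != 0)
		return -EIO;		/* block belongs to another filesystem */
	if (h->level != e->level)
		return -EIO;		/* parent pointed at a different level */
	if (e->transid && h->generation != e->transid)
		return -EIO;		/* stale copy from an older transaction */
	return 0;
}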
+ */ + if (found_level == 0 && btrfs_check_leaf(eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } + + if (found_level > 0 && btrfs_check_node(eb)) + ret = -EIO; + + if (ret) + btrfs_err(fs_info, + "read time tree block corruption detected on logical %llu mirror %u", + eb->start, eb->read_mirror); +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (folio_test_dirty(src)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (folio_get_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) + return -EAGAIN; + return migrate_folio(mapping, dst, src, mode); +} +#else +#define btree_migrate_folio NULL +#endif + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct btrfs_fs_info *fs_info; + int ret; + + if (wbc->sync_mode == WB_SYNC_NONE) { + + if (wbc->for_kupdate) + return 0; + + fs_info = BTRFS_I(mapping->host)->root->fs_info; + /* this is a bit racy, but that's ok */ + ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH, + fs_info->dirty_metadata_batch); + if (ret < 0) + return 0; + } + return btree_write_cache_pages(mapping, wbc); +} + +static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; + + return try_release_extent_buffer(&folio->page); +} + +static void btree_invalidate_folio(struct folio *folio, size_t offset, + size_t length) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(folio->mapping->host)->io_tree; + extent_invalidate_folio(tree, folio, offset); + btree_release_folio(folio, GFP_NOFS); + if (folio_get_private(folio)) { + btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + "folio private not zero on folio %llu", + (unsigned long long)folio_pos(folio)); + folio_detach_private(folio); + } +} + +#ifdef DEBUG +static bool btree_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_subpage_info *spi = fs_info->subpage_info; + struct btrfs_subpage *subpage; + struct extent_buffer *eb; + int cur_bit = 0; + u64 page_start = folio_pos(folio); + + if (fs_info->sectorsize == PAGE_SIZE) { + eb = folio_get_private(folio); + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_write_locked(eb); + return filemap_dirty_folio(mapping, folio); + } + + ASSERT(spi); + subpage = folio_get_private(folio); + + for (cur_bit = spi->dirty_offset; + cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; + cur_bit++) { + unsigned long flags; + u64 cur; + + spin_lock_irqsave(&subpage->lock, flags); + if (!test_bit(cur_bit, subpage->bitmaps)) { + spin_unlock_irqrestore(&subpage->lock, flags); + continue; + } + spin_unlock_irqrestore(&subpage->lock, flags); + cur = page_start + cur_bit * fs_info->sectorsize; + + eb = find_extent_buffer(fs_info, cur); + ASSERT(eb); + ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + ASSERT(atomic_read(&eb->refs)); + btrfs_assert_tree_write_locked(eb); + free_extent_buffer(eb); + + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; + } + return filemap_dirty_folio(mapping, 
folio); +} +#else +#define btree_dirty_folio filemap_dirty_folio +#endif + +static const struct address_space_operations btree_aops = { + .writepages = btree_writepages, + .release_folio = btree_release_folio, + .invalidate_folio = btree_invalidate_folio, + .migrate_folio = btree_migrate_folio, + .dirty_folio = btree_dirty_folio, +}; + +struct extent_buffer *btrfs_find_create_tree_block( + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, + int level) +{ + if (btrfs_is_testing(fs_info)) + return alloc_test_extent_buffer(fs_info, bytenr); + return alloc_extent_buffer(fs_info, bytenr, owner_root, level); +} + +/* + * Read tree block at logical address @bytenr and do variant basic but critical + * verification. + * + * @check: expected tree parentness check, see comments of the + * structure for details. + */ +struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, + struct btrfs_tree_parent_check *check) +{ + struct extent_buffer *buf = NULL; + int ret; + + ASSERT(check); + + buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, + check->level); + if (IS_ERR(buf)) + return buf; + + ret = btrfs_read_extent_buffer(buf, check); + if (ret) { + free_extent_buffer_stale(buf); + return ERR_PTR(ret); + } + if (btrfs_check_eb_owner(buf, check->owner_root)) { + free_extent_buffer_stale(buf); + return ERR_PTR(-EUCLEAN); + } + return buf; + +} + +static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, + u64 objectid) +{ + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + root->fs_info = fs_info; + root->root_key.objectid = objectid; + root->node = NULL; + root->commit_root = NULL; + root->state = 0; + RB_CLEAR_NODE(&root->rb_node); + + root->last_trans = 0; + root->free_objectid = 0; + root->nr_delalloc_inodes = 0; + root->nr_ordered_extents = 0; + root->inode_tree = RB_ROOT; + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + + btrfs_init_root_block_rsv(root); + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->delalloc_inodes); + INIT_LIST_HEAD(&root->delalloc_root); + INIT_LIST_HEAD(&root->ordered_extents); + INIT_LIST_HEAD(&root->ordered_root); + INIT_LIST_HEAD(&root->reloc_dirty_list); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); + spin_lock_init(&root->inode_lock); + spin_lock_init(&root->delalloc_lock); + spin_lock_init(&root->ordered_extent_lock); + spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); + spin_lock_init(&root->qgroup_meta_rsv_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + atomic_set(&root->log_batch, 0); + refcount_set(&root->refs, 1); + atomic_set(&root->snapshot_force_cow, 0); + atomic_set(&root->nr_swapfiles, 0); + root->log_transid = 0; + 
root->log_transid_committed = -1; + root->last_log_commit = 0; + root->anon_dev = 0; + if (!dummy) { + extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES); + extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE); + } + + spin_lock_init(&root->root_item_lock); + btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&root->leak_list); + spin_lock(&fs_info->fs_roots_radix_lock); + list_add_tail(&root->leak_list, &fs_info->allocated_roots); + spin_unlock(&fs_info->fs_roots_radix_lock); +#endif +} + +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + u64 objectid, gfp_t flags) +{ + struct btrfs_root *root = kzalloc(sizeof(*root), flags); + if (root) + __setup_root(root, fs_info, objectid); + return root; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* Should only be used by the testing infrastructure */ +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + /* We don't use the stripesize in selftest, set it as sectorsize */ + root->alloc_bytenr = 0; + + return root; +} +#endif + +static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node) +{ + const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node); + const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(&a->root_key, &b->root_key); +} + +static int global_root_key_cmp(const void *k, const struct rb_node *node) +{ + const struct btrfs_key *key = k; + const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(key, &root->root_key); +} + +int btrfs_global_root_insert(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *tmp; + int ret = 0; + + write_lock(&fs_info->global_root_lock); + tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); + write_unlock(&fs_info->global_root_lock); + + if (tmp) { + ret = -EEXIST; + btrfs_warn(fs_info, "global root %llu %llu already exists", + root->root_key.objectid, root->root_key.offset); + } + return ret; +} + +void btrfs_global_root_delete(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + write_lock(&fs_info->global_root_lock); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + write_unlock(&fs_info->global_root_lock); +} + +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key) +{ + struct rb_node *node; + struct btrfs_root *root = NULL; + + read_lock(&fs_info->global_root_lock); + node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp); + if (node) + root = container_of(node, struct btrfs_root, rb_node); + read_unlock(&fs_info->global_root_lock); + + return root; +} + +static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + u64 ret; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (bytenr) + block_group = btrfs_lookup_block_group(fs_info, bytenr); + else + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); + ASSERT(block_group); + if (!block_group) + return 0; + ret = block_group->global_root_id; + btrfs_put_block_group(block_group); + + return ret; +} + +struct btrfs_root 
*btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_CSUM_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = btrfs_global_root_id(fs_info, bytenr), + }; + + return btrfs_global_root(fs_info, &key); +} + +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_EXTENT_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = btrfs_global_root_id(fs_info, bytenr), + }; + + return btrfs_global_root(fs_info, &key); +} + +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + u64 objectid) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *leaf; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key key; + unsigned int nofs_flag; + int ret = 0; + + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); + root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + if (!root) + return ERR_PTR(-ENOMEM); + + root->root_key.objectid = objectid; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + leaf = NULL; + goto fail; + } + + root->node = leaf; + btrfs_mark_buffer_dirty(trans, leaf); + + root->commit_root = btrfs_root_node(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + btrfs_set_root_flags(&root->root_item, 0); + btrfs_set_root_limit(&root->root_item, 0); + btrfs_set_root_bytenr(&root->root_item, leaf->start); + btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 1); + btrfs_set_root_used(&root->root_item, leaf->len); + btrfs_set_root_last_snapshot(&root->root_item, 0); + btrfs_set_root_dirid(&root->root_item, 0); + if (is_fstree(objectid)) + generate_random_guid(root->root_item.uuid); + else + export_guid(root->root_item.uuid, &guid_null); + btrfs_set_root_drop_level(&root->root_item, 0); + + btrfs_tree_unlock(leaf); + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + if (ret) + goto fail; + + return root; + +fail: + btrfs_put_root(root); + + return ERR_PTR(ret); +} + +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + + root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + + return root; +} + +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct extent_buffer *leaf; + + /* + * DON'T set SHAREABLE bit for log trees. + * + * Log trees are not exposed to user space thus can't be snapshotted, + * and they go away before a real commit is actually done. 
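Illustration only, not from the btrfs sources: the global roots above are kept in an rbtree ordered by their key, and btrfs_csum_root()/btrfs_extent_root() build such a key with the per-block-group global root id as the offset before looking it up. The user-space sketch below shows the three-field ordering such a lookup relies on (objectid, then type, then offset); struct tree_key and tree_key_cmp are invented names, the real comparator being btrfs_comp_cpu_keys().

#include <stdint.h>

struct tree_key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

/* Standard three-way compare: objectid first, then type, then offset. */
static int tree_key_cmp(const struct tree_key *a, const struct tree_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}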
+ * + * They do store pointers to file data extents, and those reference + * counts still get updated (along with back refs to the log tree). + */ + + leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); + if (IS_ERR(leaf)) + return PTR_ERR(leaf); + + root->node = leaf; + + btrfs_mark_buffer_dirty(trans, root->node); + btrfs_tree_unlock(root->node); + + return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + if (!btrfs_is_zoned(fs_info)) { + int ret = btrfs_alloc_log_tree_node(trans, log_root); + + if (ret) { + btrfs_put_root(log_root); + return ret; + } + } + + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + int ret; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, + fs_info->nodesize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); + + btrfs_set_root_node(&log_root->root_item, log_root->node); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; + root->log_transid_committed = -1; + root->last_log_commit = 0; + return 0; +} + +static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_root *root; + struct btrfs_tree_parent_check check = { 0 }; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + u64 generation; + int ret; + int level; + + root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + + ret = btrfs_find_root(tree_root, key, path, + &root->root_item, &root->root_key); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto fail; + } + + generation = btrfs_root_generation(&root->root_item); + level = btrfs_root_level(&root->root_item); + check.level = level; + check.transid = generation; + check.owner_root = key->objectid; + root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), + &check); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + goto fail; + } + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + ret = -EIO; + goto fail; + } + + /* + * For real fs, and not log/reloc trees, root owner must + * match its root node owner + */ + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + root->root_key.objectid != btrfs_header_owner(root->node)) { + btrfs_crit(fs_info, +"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", + root->root_key.objectid, root->node->start, + btrfs_header_owner(root->node), + root->root_key.objectid); + ret = 
-EUCLEAN; + goto fail; + } + root->commit_root = btrfs_root_node(root); + return root; +fail: + btrfs_put_root(root); + return ERR_PTR(ret); +} + +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) +{ + struct btrfs_root *root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + root = read_tree_root_path(tree_root, path, key); + btrfs_free_path(path); + + return root; +} + +/* + * Initialize subvolume root in-memory structure + * + * @anon_dev: anonymous device to attach to the root, if zero, allocate new + */ +static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) +{ + int ret; + + btrfs_drew_lock_init(&root->snapshot_lock); + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + !btrfs_is_data_reloc_root(root) && + is_fstree(root->root_key.objectid)) { + set_bit(BTRFS_ROOT_SHAREABLE, &root->state); + btrfs_check_and_init_root_item(&root->root_item); + } + + /* + * Don't assign anonymous block device to roots that are not exposed to + * userspace, the id pool is limited to 1M + */ + if (is_fstree(root->root_key.objectid) && + btrfs_root_refs(&root->root_item) > 0) { + if (!anon_dev) { + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + } else { + root->anon_dev = anon_dev; + } + } + + mutex_lock(&root->objectid_mutex); + ret = btrfs_init_root_free_objectid(root); + if (ret) { + mutex_unlock(&root->objectid_mutex); + goto fail; + } + + ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&root->objectid_mutex); + + return 0; +fail: + /* The caller is responsible to call btrfs_free_fs_root */ + return ret; +} + +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) +{ + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); + root = btrfs_grab_root(root); + spin_unlock(&fs_info->fs_roots_radix_lock); + return root; +} + +static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, + u64 objectid) +{ + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + switch (objectid) { + case BTRFS_ROOT_TREE_OBJECTID: + return btrfs_grab_root(fs_info->tree_root); + case BTRFS_EXTENT_TREE_OBJECTID: + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_CHUNK_TREE_OBJECTID: + return btrfs_grab_root(fs_info->chunk_root); + case BTRFS_DEV_TREE_OBJECTID: + return btrfs_grab_root(fs_info->dev_root); + case BTRFS_CSUM_TREE_OBJECTID: + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_QUOTA_TREE_OBJECTID: + return btrfs_grab_root(fs_info->quota_root); + case BTRFS_UUID_TREE_OBJECTID: + return btrfs_grab_root(fs_info->uuid_root); + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->block_group_root); + case BTRFS_FREE_SPACE_TREE_OBJECTID: + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + default: + return NULL; + } +} + +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + int ret; + + ret = radix_tree_preload(GFP_NOFS); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) { + btrfs_grab_root(root); + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + 
+ return ret; +} + +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +{ +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_root *root; + + while (!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + + root = list_first_entry(&fs_info->allocated_roots, + struct btrfs_root, leak_list); + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(&root->root_key, buf), + refcount_read(&root->refs)); + while (refcount_read(&root->refs) > 1) + btrfs_put_root(root); + btrfs_put_root(root); + } +#endif +} + +static void free_global_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct rb_node *node; + + while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) { + root = rb_entry(node, struct btrfs_root, rb_node); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + btrfs_put_root(root); + } +} + +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) +{ + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->ordered_bytes); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); + btrfs_free_csum_hash(fs_info); + btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + free_global_roots(fs_info); + btrfs_put_root(fs_info->tree_root); + btrfs_put_root(fs_info->chunk_root); + btrfs_put_root(fs_info->dev_root); + btrfs_put_root(fs_info->quota_root); + btrfs_put_root(fs_info->uuid_root); + btrfs_put_root(fs_info->fs_root); + btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); + btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info->subpage_info); + kvfree(fs_info); +} + + +/* + * Get an in-memory reference of a root structure. + * + * For essential trees like root/extent tree, we grab it from fs_info directly. + * For subvolume trees, we check the cached filesystem roots first. If not + * found, then read it from disk and add it to cached fs roots. + * + * Caller should release the root by calling btrfs_put_root() after the usage. + * + * NOTE: Reloc and log trees can't be read by this function as they share the + * same root objectid. + * + * @objectid: root id + * @anon_dev: preallocated anonymous block device number for new roots, + * pass 0 for new allocation. + * @check_ref: whether to check root item references, If true, return -ENOENT + * for orphan roots + */ +static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev, + bool check_ref) +{ + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; + + /* + * If we're called for non-subvolume trees, and above function didn't + * find one, do not try to read it from disk. + * + * This is namely for free-space-tree and quota tree, which can change + * at runtime and should only be grabbed from fs_info. 
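+ * + * Subvolume trees and the data relocation tree fall through to the + * cached lookup below and, if not cached yet, are read from disk.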
+ */ + if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + return ERR_PTR(-ENOENT); +again: + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) { + /* Shouldn't get preallocated anon_dev for cached roots */ + ASSERT(!anon_dev); + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_root(root); + return ERR_PTR(-ENOENT); + } + return root; + } + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_tree_root(fs_info->tree_root, &key); + if (IS_ERR(root)) + return root; + + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_init_fs_root(root, anon_dev); + if (ret) + goto fail; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto fail; + } + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + btrfs_free_path(path); + if (ret < 0) + goto fail; + if (ret == 0) + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + + ret = btrfs_insert_fs_root(fs_info, root); + if (ret) { + if (ret == -EEXIST) { + btrfs_put_root(root); + goto again; + } + goto fail; + } + return root; +fail: + /* + * If our caller provided us an anonymous device, then it's his + * responsibility to free it in case we fail. So we have to set our + * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() + * and once again by our caller. + */ + if (anon_dev) + root->anon_dev = 0; + btrfs_put_root(root); + return ERR_PTR(ret); +} + +/* + * Get in-memory reference of a root structure + * + * @objectid: tree objectid + * @check_ref: if set, verify that the tree exists and the item has at least + * one reference + */ +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, bool check_ref) +{ + return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); +} + +/* + * Get in-memory reference of a root structure, created as new, optionally pass + * the anonymous block device id + * + * @objectid: tree objectid + * @anon_dev: if zero, allocate a new anonymous block device or use the + * parameter value + */ +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev) +{ + return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); +} + +/* + * btrfs_get_fs_root_commit_root - return a root for the given objectid + * @fs_info: the fs_info + * @objectid: the objectid we need to lookup + * + * This is exclusively used for backref walking, and exists specifically because + * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref + * creation time, which means we may have to read the tree_root in order to look + * up a fs root that is not in memory. If the root is not in memory we will + * read the tree root commit root and look up the fs root from there. This is a + * temporary root, it will not be inserted into the radix tree as it doesn't + * have the most uptodate information, it'll simply be discarded once the + * backref code is finished using the root. 
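+ * + * The caller is still expected to drop the returned reference with + * btrfs_put_root() once the backref walk is done with it.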
+ */ +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_root *root; + struct btrfs_key key; + + ASSERT(path->search_commit_root && path->skip_locking); + + /* + * This can return -ENOENT if we ask for a root that doesn't exist, but + * since this is called via the backref walking code we won't be looking + * up a root that doesn't exist, unless there's corruption. So if root + * != NULL just return it. + */ + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; + + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) + return root; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = read_tree_root_path(fs_info->tree_root, path, &key); + btrfs_release_path(path); + + return root; +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_fs_info *fs_info = arg; + int again; + + while (1) { + again = 0; + + set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + + /* Make the cleaner go to sleep early. */ + if (btrfs_need_cleaner_sleep(fs_info)) + goto sleep; + + /* + * Do not do anything if we might cause open_ctree() to block + * before we have finished mounting the filesystem. + */ + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + goto sleep; + + if (!mutex_trylock(&fs_info->cleaner_mutex)) + goto sleep; + + /* + * Avoid the problem that we change the status of the fs + * during the above check and trylock. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { + mutex_unlock(&fs_info->cleaner_mutex); + goto sleep; + } + + if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) + btrfs_sysfs_feature_update(fs_info); + + btrfs_run_delayed_iputs(fs_info); + + again = btrfs_clean_one_deleted_snapshot(fs_info); + mutex_unlock(&fs_info->cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount and umount, + * needn't do anything special here. + */ + btrfs_run_defrag_inodes(fs_info); + + /* + * Acquires fs_info->reclaim_bgs_lock to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->reclaim_bgs_lock. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. + */ + btrfs_delete_unused_bgs(fs_info); + + /* + * Reclaim block groups in the reclaim_bgs list after we deleted + * all unused block_groups. This possibly gives us some more free + * space. 
+ */ + btrfs_reclaim_bgs(fs_info); +sleep: + clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + if (kthread_should_park()) + kthread_parkme(); + if (kthread_should_stop()) + return 0; + if (!again) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + u64 transid; + time64_t delta; + unsigned long delay; + bool cannot_commit; + + do { + cannot_commit = false; + delay = msecs_to_jiffies(fs_info->commit_interval * 1000); + mutex_lock(&fs_info->transaction_kthread_mutex); + + spin_lock(&fs_info->trans_lock); + cur = fs_info->running_transaction; + if (!cur) { + spin_unlock(&fs_info->trans_lock); + goto sleep; + } + + delta = ktime_get_seconds() - cur->start_time; + if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && + cur->state < TRANS_STATE_COMMIT_PREP && + delta < fs_info->commit_interval) { + spin_unlock(&fs_info->trans_lock); + delay -= msecs_to_jiffies((delta - 1) * 1000); + delay = min(delay, + msecs_to_jiffies(fs_info->commit_interval * 1000)); + goto sleep; + } + transid = cur->transid; + spin_unlock(&fs_info->trans_lock); + + /* If the file system is aborted, this will always fail. */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; + goto sleep; + } + if (transid == trans->transid) { + btrfs_commit_transaction(trans); + } else { + btrfs_end_transaction(trans); + } +sleep: + wake_up_process(fs_info->cleaner_kthread); + mutex_unlock(&fs_info->transaction_kthread_mutex); + + if (BTRFS_FS_ERROR(fs_info)) + btrfs_cleanup_transaction(fs_info); + if (!kthread_should_stop() && + (!btrfs_transaction_blocked(fs_info) || + cannot_commit)) + schedule_timeout_interruptible(delay); + } while (!kthread_should_stop()); + return 0; +} + +/* + * This will find the highest generation in the array of root backups. The + * index of the highest array is returned, or -EINVAL if we can't find + * anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. + */ +static int find_newest_super_backup(struct btrfs_fs_info *info) +{ + const u64 newest_gen = btrfs_super_generation(info->super_copy); + u64 cur; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + return i; + } + + return -EINVAL; +} + +/* + * copy all the root pointers into the super backup array. 
+ * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + const int next_backup = info->backup_root_index; + struct btrfs_root_backup *root_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(csum_root->node)); + } + + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + } + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * read_backup_root - Reads a backup root based on the passed priority. Prio 0 + * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots + * + * fs_info - filesystem whose backup roots need to be read + * priority - priority of backup root required + * + * Returns backup root index on success and -EINVAL otherwise. 
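+ * + * On success the tree root pointers and related fields of the in-memory + * super block are rewritten to point at the selected backup copy.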
+ */ +static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority) +{ + int backup_index = find_newest_super_backup(fs_info); + struct btrfs_super_block *super = fs_info->super_copy; + struct btrfs_root_backup *root_backup; + + if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) { + if (priority == 0) + return backup_index; + + backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority; + backup_index %= BTRFS_NUM_BACKUP_ROOTS; + } else { + return -EINVAL; + } + + root_backup = super->super_roots + backup_index; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * Fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + + return backup_index; +} + +/* helper to cleanup workers */ +static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) +{ + btrfs_destroy_workqueue(fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->workers); + if (fs_info->endio_workers) + destroy_workqueue(fs_info->endio_workers); + if (fs_info->rmw_workers) + destroy_workqueue(fs_info->rmw_workers); + if (fs_info->compressed_write_workers) + destroy_workqueue(fs_info->compressed_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + if (fs_info->discard_ctl.discard_workers) + destroy_workqueue(fs_info->discard_ctl.discard_workers); + /* + * Now that all other work queues are destroyed, we can safely destroy + * the queues used for metadata I/O, since tasks from those other work + * queues can do metadata I/O operations. 
+ */ + if (fs_info->endio_meta_workers) + destroy_workqueue(fs_info->endio_meta_workers); +} + +static void free_root_extent_buffers(struct btrfs_root *root) +{ + if (root) { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + root->node = NULL; + root->commit_root = NULL; + } +} + +static void free_global_root_pointers(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root, *tmp; + + rbtree_postorder_for_each_entry_safe(root, tmp, + &fs_info->global_root_tree, + rb_node) + free_root_extent_buffers(root); +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) +{ + free_root_extent_buffers(info->tree_root); + + free_global_root_pointers(info); + free_root_extent_buffers(info->dev_root); + free_root_extent_buffers(info->quota_root); + free_root_extent_buffers(info->uuid_root); + free_root_extent_buffers(info->fs_root); + free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); + if (free_chunk_root) + free_root_extent_buffers(info->chunk_root); +} + +void btrfs_put_root(struct btrfs_root *root) +{ + if (!root) + return; + + if (refcount_dec_and_test(&root->refs)) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + free_root_extent_buffers(root); +#ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); + list_del_init(&root->leak_list); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif + kfree(root); + } +} + +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + btrfs_put_root(gang[0]); + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); + } +} + +static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + refcount_set(&fs_info->scrub_workers_refcnt, 0); +} + +static void btrfs_init_balance(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); + atomic_set(&fs_info->reloc_cancel_req, 0); +} + +static int btrfs_init_btree_inode(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, + fs_info->tree_root); + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; + + inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(inode, 1); + /* + * we set the i_size on the btree inode to the max possible int. 
+ * the real end of the address space is determined by all of + * the devices in the system + */ + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &btree_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_BTREE_INODE_IO); + extent_map_tree_init(&BTRFS_I(inode)->extent_tree); + + BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); + BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; + BTRFS_I(inode)->location.type = 0; + BTRFS_I(inode)->location.offset = 0; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + __insert_inode_hash(inode, hash); + fs_info->btree_inode = inode; + + return 0; +} + +static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + init_rwsem(&fs_info->dev_replace.rwsem); + init_waitqueue_head(&fs_info->dev_replace.replace_wait); +} + +static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + mutex_init(&fs_info->qgroup_rescan_lock); +} + +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) +{ + u32 max_active = fs_info->thread_pool_size; + unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; + + fs_info->workers = + btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); + + fs_info->delalloc_workers = + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); + + fs_info->flush_workers = + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); + + fs_info->caching_workers = + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); + + fs_info->fixup_workers = + btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); + + fs_info->endio_workers = + alloc_workqueue("btrfs-endio", flags, max_active); + fs_info->endio_meta_workers = + alloc_workqueue("btrfs-endio-meta", flags, max_active); + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); + fs_info->endio_write_workers = + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); + fs_info->compressed_write_workers = + alloc_workqueue("btrfs-compressed-write", flags, max_active); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); + fs_info->qgroup_rescan_workers = + btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", + ordered_flags); + fs_info->discard_ctl.discard_workers = + alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); + + if (!(fs_info->workers && + fs_info->delalloc_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->compressed_write_workers && + fs_info->endio_write_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->fixup_workers && + fs_info->delayed_workers && fs_info->qgroup_rescan_workers && + fs_info->discard_ctl.discard_workers)) { + return -ENOMEM; + } + + return 0; +} + +static int 
btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +{ + struct crypto_shash *csum_shash; + const char *csum_driver = btrfs_super_csum_driver(csum_type); + + csum_shash = crypto_alloc_shash(csum_driver, 0, 0); + + if (IS_ERR(csum_shash)) { + btrfs_err(fs_info, "error allocating %s hash for checksum", + csum_driver); + return PTR_ERR(csum_shash); + } + + fs_info->csum_shash = csum_shash; + + /* + * Check if the checksum implementation is a fast accelerated one. + * As-is this is a bit of a hack and should be replaced once the csum + * implementations provide that information themselves. + */ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; + case BTRFS_CSUM_TYPE_XXHASH: + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; + default: + break; + } + + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); + return 0; +} + +static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_tree_parent_check check = { 0 }; + struct btrfs_root *log_tree_root; + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 bytenr = btrfs_super_log_root(disk_super); + int level = btrfs_super_log_root_level(disk_super); + + if (fs_devices->rw_devices == 0) { + btrfs_warn(fs_info, "log replay required on RO media"); + return -EIO; + } + + log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, + GFP_KERNEL); + if (!log_tree_root) + return -ENOMEM; + + check.level = level; + check.transid = fs_info->generation + 1; + check.owner_root = BTRFS_TREE_LOG_OBJECTID; + log_tree_root->node = read_tree_block(fs_info, bytenr, &check); + if (IS_ERR(log_tree_root->node)) { + btrfs_warn(fs_info, "failed to read log tree"); + ret = PTR_ERR(log_tree_root->node); + log_tree_root->node = NULL; + btrfs_put_root(log_tree_root); + return ret; + } + if (!extent_buffer_uptodate(log_tree_root->node)) { + btrfs_err(fs_info, "failed to read log tree"); + btrfs_put_root(log_tree_root); + return -EIO; + } + + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); + btrfs_put_root(log_tree_root); + return ret; + } + + if (sb_rdonly(fs_info->sb)) { + ret = btrfs_commit_super(fs_info); + if (ret) + return ret; + } + + return 0; +} + +static int load_global_roots_objectid(struct btrfs_root *tree_root, + struct btrfs_path *path, u64 objectid, + const char *name) +{ + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *root; + u64 max_global_id = 0; + int ret; + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + bool found = false; + + /* If we have IGNOREDATACSUMS skip loading these roots. 
*/ + if (objectid == BTRFS_CSUM_TREE_OBJECTID && + btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + return 0; + } + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + break; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(tree_root, path); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + } + ret = 0; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != objectid) + break; + btrfs_release_path(path); + + /* + * Just worry about this for extent tree, it'll be the same for + * everybody. + */ + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + max_global_id = max(max_global_id, key.offset); + + found = true; + root = read_tree_root_path(tree_root, path, &key); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = PTR_ERR(root); + break; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + ret = btrfs_global_root_insert(root); + if (ret) { + btrfs_put_root(root); + break; + } + key.offset++; + } + btrfs_release_path(path); + + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + fs_info->nr_global_roots = max_global_id + 1; + + if (!found || ret) { + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = ret ? ret : -ENOENT; + else + ret = 0; + btrfs_err(fs_info, "failed to load root %s", name); + } + return ret; +} + +static int load_global_roots(struct btrfs_root *tree_root) +{ + struct btrfs_path *path; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = load_global_roots_objectid(tree_root, path, + BTRFS_EXTENT_TREE_OBJECTID, "extent"); + if (ret) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_CSUM_TREE_OBJECTID, "csum"); + if (ret) + goto out; + if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_FREE_SPACE_TREE_OBJECTID, + "free space"); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_read_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key location; + int ret; + + BUG_ON(!fs_info->tree_root); + + ret = load_global_roots(tree_root); + if (ret) + return ret; + + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->block_group_root = root; + } + } + + location.objectid = BTRFS_DEV_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + } + /* Initialize fs_info for all devices in any case */ + ret = btrfs_init_devices_late(fs_info); + if (ret) + goto out; + + /* + * This tree can share blocks with some other fs tree during relocation + * and we need a proper setup by btrfs_get_fs_root + */ + root = btrfs_get_fs_root(tree_root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID, true); + if (IS_ERR(root)) { + if 
(!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; + } + + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + fs_info->quota_root = root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->uuid_root = root; + } + + return 0; +out: + btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", + location.objectid, ret); + return ret; +} + +/* + * Real super block validation + * NOTE: super csum type and incompat features will not be checked here. + * + * @sb: super block to check + * @mirror_num: the super block number to check its bytenr: + * 0 the primary (1st) sb + * 1, 2 2nd and 3rd backup copy + * -1 skip bytenr check + */ +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) +{ + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); + int ret = 0; + + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + btrfs_err(fs_info, "no valid FS found"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "tree_root level too big: %d >= %d", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "chunk_root level too big: %d >= %d", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "log_root level too big: %d >= %d", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + + /* + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. + */ + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); + ret = -EINVAL; + } + + /* + * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * + * We can support 16K sectorsize with 64K page size without problem, + * but such sectorsize/pagesize combination doesn't make much sense. + * 4K will be our future standard, PAGE_SIZE is supported from the very + * beginning. 
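+ * + * Additional restrictions for the subpage case (sectorsize < PAGE_SIZE) + * are applied later, in btrfs_check_features() and open_ctree().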
+ */ + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + btrfs_err(fs_info, + "sectorsize %llu not yet supported for page size %lu", + sectorsize, PAGE_SIZE); + ret = -EINVAL; + } + + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid nodesize %llu", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + btrfs_err(fs_info, "invalid leafsize %u, should be %llu", + le32_to_cpu(sb->__unused_leafsize), nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { + btrfs_warn(fs_info, "tree_root block unaligned: %llu", + btrfs_super_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { + btrfs_warn(fs_info, "chunk_root block unaligned: %llu", + btrfs_super_chunk_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + btrfs_warn(fs_info, "log_root block unaligned: %llu", + btrfs_super_log_root(sb)); + ret = -EINVAL; + } + + if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", + sb->fsid, fs_info->fs_devices->fsid); + ret = -EINVAL; + } + + if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, +"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", + btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); + ret = -EINVAL; + } + + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Artificial requirement for block-group-tree to force newer features + * (free-space-tree, no-holes) so the test matrix is smaller. 
+ */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES))) { + btrfs_err(fs_info, + "block-group-tree feature requires free-space-tree and no-holes"); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_num_devices(sb) > (1UL << 31)) + btrfs_warn(fs_info, "suspicious number of devices: %llu", + btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + btrfs_err(fs_info, "number of devices is 0"); + ret = -EINVAL; + } + + if (mirror_num >= 0 && + btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { + btrfs_err(fs_info, "super offset mismatch %llu != %u", + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + btrfs_err(fs_info, "system chunk array too small %u < %zu", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) + btrfs_warn(fs_info, + "suspicious: generation < chunk_root_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) + btrfs_warn(fs_info, + "suspicious: generation < cache_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_cache_generation(sb)); + + return ret; +} + +/* + * Validation of super block at mount time. + * Some checks already done early at mount time, like csum type and incompat + * flags, will be skipped. + */ +static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) +{ + return btrfs_validate_super(fs_info, fs_info->super_copy, 0); +} + +/* + * Validation of super block at write time. + * Some checks like bytenr check will be skipped as their values will be + * overwritten soon. + * Extra checks like csum type and incompat flags will be done here.
+ */ +static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb) +{ + int ret; + + ret = btrfs_validate_super(fs_info, sb, -1); + if (ret < 0) + goto out; + if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { + ret = -EUCLEAN; + btrfs_err(fs_info, "invalid csum type, has %u want %u", + btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); + goto out; + } + if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "invalid incompat flags, has 0x%llx valid mask 0x%llx", + btrfs_super_incompat_flags(sb), + (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); + goto out; + } +out: + if (ret < 0) + btrfs_err(fs_info, + "super block corruption detected before writing it to disk"); + return ret; +} + +static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) +{ + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen, + .owner_root = root->root_key.objectid + }; + int ret = 0; + + root->node = read_tree_block(root->fs_info, bytenr, &check); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + return ret; + } + if (!extent_buffer_uptodate(root->node)) { + free_extent_buffer(root->node); + root->node = NULL; + return -EIO; + } + + btrfs_set_root_node(&root->root_item, root->node); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_refs(&root->root_item, 1); + return ret; +} + +static int load_important_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 gen, bytenr; + int level, ret; + + bytenr = btrfs_super_root(sb); + gen = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + return 0; +} + +static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) +{ + int backup_index = find_newest_super_backup(fs_info); + struct btrfs_super_block *sb = fs_info->super_copy; + struct btrfs_root *tree_root = fs_info->tree_root; + bool handle_error = false; + int ret = 0; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + if (handle_error) { + if (!IS_ERR(tree_root->node)) + free_extent_buffer(tree_root->node); + tree_root->node = NULL; + + if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) + break; + + free_root_pointers(fs_info, 0); + + /* + * Don't use the log in recovery mode, it won't be + * valid + */ + btrfs_set_super_log_root(sb, 0); + + /* We can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + btrfs_warn(fs_info, "try to load backup roots slot %d", i); + ret = read_backup_root(fs_info, i); + backup_index = ret; + if (ret < 0) + return ret; + } + + ret = load_important_roots(fs_info); + if (ret) { + handle_error = true; + continue; + } + + /* + * No need to hold btrfs_root::objectid_mutex since the fs + * hasn't been fully initialised and we are the only user + */ + ret = btrfs_init_root_free_objectid(tree_root); + if (ret < 0) { + handle_error = true; + continue; + } + + ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); + + ret = btrfs_read_roots(fs_info); + if (ret < 0) { + handle_error = true; + continue; + } + + /* All successful */ + fs_info->generation = btrfs_header_generation(tree_root->node); + fs_info->last_trans_committed = fs_info->generation; + fs_info->last_reloc_trans = 0; + + /* Always begin writing backup roots after the one being used 
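+ * (the index wraps modulo BTRFS_NUM_BACKUP_ROOTS, as done just below)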
*/ + if (backup_index < 0) { + fs_info->backup_root_index = 0; + } else { + fs_info->backup_root_index = backup_index + 1; + fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS; + } + break; + } + + return ret; +} + +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) +{ + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); + INIT_LIST_HEAD(&fs_info->delalloc_roots); + INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_root_lock); + spin_lock_init(&fs_info->trans_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->unused_bgs_lock); + spin_lock_init(&fs_info->treelog_bg_lock); + spin_lock_init(&fs_info->zone_active_bgs_lock); + spin_lock_init(&fs_info->relocation_bg_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + rwlock_init(&fs_info->global_root_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->reclaim_bgs_lock); + mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); + seqlock_init(&fs_info->profiles_lock); + + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); + btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, + BTRFS_LOCKDEP_TRANS_UNBLOCKED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed, + BTRFS_LOCKDEP_TRANS_COMPLETED); + + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->unused_bgs); + INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->zone_active_bgs); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&fs_info->allocated_roots); + INIT_LIST_HEAD(&fs_info->allocated_ebs); + spin_lock_init(&fs_info->eb_leak_lock); +#endif + extent_map_tree_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, + BTRFS_BLOCK_RSV_DELREFS); + + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->nr_delayed_iputs, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->global_root_tree = RB_ROOT; + fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; + fs_info->metadata_ratio = 0; + fs_info->defrag_inodes = RB_ROOT; + atomic64_set(&fs_info->free_chunk_space, 0); + fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = 
BTRFS_DEFAULT_COMMIT_INTERVAL; + btrfs_init_ref_verify(fs_info); + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_roots); + spin_lock_init(&fs_info->ordered_root_lock); + + btrfs_init_scrub(fs_info); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif + btrfs_init_balance(fs_info); + btrfs_init_async_reclaim_work(fs_info); + + rwlock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT_CACHED; + + extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS); + + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->ro_block_group_mutex); + init_rwsem(&fs_info->commit_root_sem); + init_rwsem(&fs_info->cleanup_work_sem); + init_rwsem(&fs_info->subvol_sem); + sema_init(&fs_info->uuid_tree_rescan_sem, 1); + + btrfs_init_dev_replace_locks(fs_info); + btrfs_init_qgroup(fs_info); + btrfs_discard_init(fs_info); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->delayed_iputs_wait); + + /* Usable values until the real ones are cached from the superblock */ + fs_info->nodesize = 4096; + fs_info->sectorsize = 4096; + fs_info->sectorsize_bits = ilog2(4096); + fs_info->stripesize = 4096; + + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; + + spin_lock_init(&fs_info->swapfile_pins_lock); + fs_info->swapfile_pins = RB_ROOT; + + fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; + INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); +} + +static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + int ret; + + fs_info->sb = sb; + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); + + ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + fs_info->dirty_metadata_batch = PAGE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); + if (ret) + return ret; + + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_KERNEL); + if (!fs_info->delayed_root) + return -ENOMEM; + btrfs_init_delayed_root(fs_info->delayed_root); + + if (sb_rdonly(sb)) + set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + + return btrfs_alloc_stripe_hash_table(fs_info); +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. 
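+ * + * The second step is handled by btrfs_uuid_scan_kthread() below; if the + * first step fails we drop uuid_tree_rescan_sem and bail out.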
+ */ + ret = btrfs_uuid_tree_iterate(fs_info); + if (ret < 0) { + if (ret != -EINTR) + btrfs_warn(fs_info, "iterating uuid_tree failed %d", + ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; + int err = 0; + unsigned int ret = 0; + + while (1) { + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; + } + root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots. */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* Grab all the search result for later use. */ + gang[i] = btrfs_grab_root(gang[i]); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +out: + /* Release the uncleaned roots due to error. */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); + } + return err; +} + +/* + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount + * code paths. + */ +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) +{ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); +} + +/* + * Mounting logic specific to read-write file systems. Shared by open_ctree + * and btrfs_remount when remounting from read-only to read-write. 
+ */ +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) +{ + int ret; + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); + bool rebuild_free_space_tree = false; + + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + rebuild_free_space_tree = true; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + rebuild_free_space_tree = true; + } + + if (rebuild_free_space_tree) { + btrfs_info(fs_info, "rebuilding free space tree"); + ret = btrfs_rebuild_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to rebuild free space tree: %d", ret); + goto out; + } + } + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "disabling free space tree"); + ret = btrfs_delete_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to disable free space tree: %d", ret); + goto out; + } + } + + /* + * btrfs_find_orphan_roots() is responsible for finding all the dead + * roots (with 0 refs), flagging them with BTRFS_ROOT_DEAD_TREE and loading + * them into the fs_info->fs_roots_radix tree. This must be done before + * calling btrfs_orphan_cleanup() on the tree root. If we don't do it + * first, then btrfs_orphan_cleanup() will delete a dead root's orphan + * item before the root's tree is deleted - this means that if we unmount + * or crash before the deletion completes, on the next mount we will not + * delete what remains of the tree because the orphan item does not + * exist anymore, which is what tells us we have a pending deletion. + */ + ret = btrfs_find_orphan_roots(fs_info); + if (ret) + goto out; + + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto out; + + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { + up_read(&fs_info->cleanup_work_sem); + goto out; + } + up_read(&fs_info->cleanup_work_sem); + + mutex_lock(&fs_info->cleaner_mutex); + ret = btrfs_recover_relocation(fs_info); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + btrfs_warn(fs_info, "failed to recover relocation: %d", ret); + goto out; + } + + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "creating free space tree"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); + goto out; + } + } + + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { + ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); + if (ret) + goto out; + } + + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto out; + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to resume dev_replace"); + goto out; + } + + btrfs_qgroup_rescan_resume(fs_info); + + if (!fs_info->uuid_root) { + btrfs_info(fs_info, "creating UUID tree"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create the UUID tree %d", ret); + goto out; + } + } + +out: + return ret; +} + +/* + * Do various sanity and dependency checks of different features. + * + * @is_rw_mount: If the mount is read-write. + * + * This is the place for less strict checks (like for subpage or artificial + * feature dependencies).
+ * + * For strict checks or possible corruption detection, see + * btrfs_validate_super(). + * + * This should be called after btrfs_parse_options(), as some mount options + * (space cache related) can modify on-disk format like free space tree and + * screw up certain feature dependencies. + */ +int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 incompat = btrfs_super_incompat_flags(disk_super); + const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super); + const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP); + + if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + btrfs_err(fs_info, + "cannot mount because of unknown incompat features (0x%llx)", + incompat); + return -EINVAL; + } + + /* Runtime limitation for mixed block groups. */ + if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (fs_info->sectorsize != fs_info->nodesize)) { + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + fs_info->nodesize, fs_info->sectorsize); + return -EINVAL; + } + + /* Mixed backref is an always-enabled feature. */ + incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + + /* Set compression related flags just in case. */ + if (fs_info->compress_type == BTRFS_COMPRESS_LZO) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; + + /* + * An ancient flag, which should really be marked deprecated. + * Such a runtime limitation doesn't really need an incompat flag. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + + if (compat_ro_unsupp && is_rw_mount) { + btrfs_err(fs_info, + "cannot mount read-write because of unknown compat_ro features (0x%llx)", + compat_ro); + return -EINVAL; + } + + /* + * We have unsupported RO compat features; although RO mounted, we + * should not cause any metadata writes, including log replay, or we + * could screw up whatever the new feature requires. + */ + if (compat_ro_unsupp && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + compat_ro); + return -EINVAL; + } + + /* + * Artificial limitations for block group tree, to force + * block-group-tree to rely on no-holes and free-space-tree. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) { + btrfs_err(fs_info, +"block-group-tree feature requires no-holes and free-space-tree features"); + return -EINVAL; + } + + /* + * Subpage runtime limitation on v1 cache. + * + * V1 space cache still has some hardcoded PAGE_SIZE usage, while + * we're already defaulting to v2 cache; no need to bother with v1 as it's + * going to be deprecated anyway. + */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } + + /* This can be called by remount, we need to protect the super block.
*/ + spin_lock(&fs_info->super_lock); + btrfs_set_super_incompat_flags(disk_super, incompat); + spin_unlock(&fs_info->super_lock); + + return 0; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 stripesize; + u64 generation; + u64 features; + u16 csum_type; + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; + int level; + + ret = init_mount_fs_info(fs_info, sb); + if (ret) + goto fail; + + /* These need to be init'ed before we start creating inodes and such. */ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, + GFP_KERNEL); + fs_info->tree_root = tree_root; + chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { + ret = -ENOMEM; + goto fail; + } + + ret = btrfs_init_btree_inode(sb); + if (ret) + goto fail; + + invalidate_bdev(fs_devices->latest_dev->bdev); + + /* + * Read super block and check the signature bytes only + */ + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + if (IS_ERR(disk_super)) { + ret = PTR_ERR(disk_super); + goto fail_alloc; + } + + btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid); + /* + * Verify the type first, if that or the checksum value are + * corrupted, we'll find out + */ + csum_type = btrfs_super_csum_type(disk_super); + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + csum_type); + ret = -EINVAL; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } + + fs_info->csum_size = btrfs_super_csum_size(disk_super); + + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } + + /* + * We want to check superblock checksum, the type is stored inside. + * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). + */ + if (btrfs_check_super_csum(fs_info, disk_super)) { + btrfs_err(fs_info, "superblock checksum mismatch"); + ret = -EINVAL; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } + + /* + * super_copy is zeroed at allocation time and we never touch the + * following bytes up to INFO_SIZE, the checksum is calculated from + * the whole block of INFO_SIZE + */ + memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); + btrfs_release_disk_super(disk_super); + + disk_super = fs_info->super_copy; + + + features = btrfs_super_flags(disk_super); + if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; + btrfs_set_super_flags(disk_super, features); + btrfs_info(fs_info, + "found metadata UUID change in progress flag, clearing"); + } + + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); + + ret = btrfs_validate_mount_super(fs_info); + if (ret) { + btrfs_err(fs_info, "superblock contains fatal errors"); + ret = -EINVAL; + goto fail_alloc; + } + + if (!btrfs_super_root(disk_super)) { + btrfs_err(fs_info, "invalid superblock tree root bytenr"); + ret = -EINVAL; + goto fail_alloc; + } + + /* check FS state, whether FS is broken. 
*/ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + WRITE_ONCE(fs_info->fs_error, -EUCLEAN); + + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + + + /* Set up fs_info before parsing mount options */ + nodesize = btrfs_super_nodesize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = sectorsize; + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); + + fs_info->nodesize = nodesize; + fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; + fs_info->stripesize = stripesize; + + ret = btrfs_parse_options(fs_info, options, sb->s_flags); + if (ret) + goto fail_alloc; + + ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); + if (ret < 0) + goto fail_alloc; + + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; + + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + + btrfs_warn(fs_info, + "read-write for sector size %u with page size %lu is experimental", + sectorsize, PAGE_SIZE); + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); + if (!subpage_info) { + ret = -ENOMEM; + goto fail_alloc; + } + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; + } + + ret = btrfs_init_workqueues(fs_info); + if (ret) + goto fail_sb_buffer; + + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(fs_info); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + btrfs_err(fs_info, "failed to read the system array: %d", ret); + goto fail_sb_buffer; + } + + generation = btrfs_super_chunk_root_generation(disk_super); + level = btrfs_super_chunk_root_level(disk_super); + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), + generation, level); + if (ret) { + btrfs_err(fs_info, "failed to read chunk root"); + goto fail_tree_roots; + } + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_UUID_SIZE); + + ret = btrfs_read_chunk_tree(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to read chunk tree: %d", ret); + goto fail_tree_roots; + } + + /* + * At this point we know all the devices that make this filesystem, + * including the seed devices but we don't know yet if the replace + * target is required. So free devices that are not part of this + * filesystem but skip the replace target device which is checked + * below in btrfs_init_dev_replace(). 
+ */ + btrfs_free_extra_devids(fs_devices); + if (!fs_devices->latest_dev->bdev) { + btrfs_err(fs_info, "failed to read devices"); + ret = -EIO; + goto fail_tree_roots; + } + + ret = init_tree_roots(fs_info); + if (ret) + goto fail_tree_roots; + + /* + * Get zone type information of zoned block devices. This will also + * handle emulation of a zoned filesystem if a regular device has the + * zoned incompat feature flag set. + */ + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to read device zone info: %d", ret); + goto fail_block_groups; + } + + /* + * If we have a uuid root and we're not being told to rescan we need to + * check the generation here so we can set the + * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the + * transaction during a balance or the log replay without updating the + * uuid generation, and then if we crash we would rescan the uuid tree, + * even though it was perfectly fine. + */ + if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && + fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + + ret = btrfs_verify_dev_extents(fs_info); + if (ret) { + btrfs_err(fs_info, + "failed to verify dev extents against chunks: %d", + ret); + goto fail_block_groups; + } + ret = btrfs_recover_balance(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to recover balance: %d", ret); + goto fail_block_groups; + } + + ret = btrfs_init_dev_stats(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to init dev_stats: %d", ret); + goto fail_block_groups; + } + + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to init dev_replace: %d", ret); + goto fail_block_groups; + } + + ret = btrfs_check_zoned_mode(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to initialize zoned mode: %d", + ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_fsid(fs_devices); + if (ret) { + btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", + ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_mounted(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); + goto fail_fsdev_sysfs; + } + + ret = btrfs_init_space_info(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to initialize space info: %d", ret); + goto fail_sysfs; + } + + ret = btrfs_read_block_groups(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to read block groups: %d", ret); + goto fail_sysfs; + } + + btrfs_free_zone_cache(fs_info); + + btrfs_check_active_zone_reservation(fs_info); + + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "writable mount is not allowed due to too many missing devices"); + ret = -EINVAL; + goto fail_sysfs; + } + + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, + "btrfs-cleaner"); + if (IS_ERR(fs_info->cleaner_kthread)) { + ret = PTR_ERR(fs_info->cleaner_kthread); + goto fail_sysfs; + } + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (IS_ERR(fs_info->transaction_kthread)) { + ret = PTR_ERR(fs_info->transaction_kthread); + goto fail_cleaner; + } + + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) { + btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); + } + + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set 
or disabled. This could be turned off by + * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. + */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + } + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { + ret = btrfsic_mount(fs_info, fs_devices, + btrfs_test_opt(fs_info, + CHECK_INTEGRITY_DATA) ? 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + btrfs_warn(fs_info, + "failed to initialize integrity check module: %d", + ret); + } +#endif + ret = btrfs_read_qgroup_config(fs_info); + if (ret) + goto fail_trans_kthread; + + if (btrfs_build_ref_tree(fs_info)) + btrfs_err(fs_info, "couldn't build ref tree"); + + /* do not make disk changes in broken FS or nologreplay is given */ + if (btrfs_super_log_root(disk_super) != 0 && + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); + ret = btrfs_replay_log(fs_info, fs_devices); + if (ret) + goto fail_qgroup; + } + + fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); + if (IS_ERR(fs_info->fs_root)) { + ret = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", ret); + fs_info->fs_root = NULL; + goto fail_qgroup; + } + + if (sb_rdonly(sb)) + goto clear_oneshot; + + ret = btrfs_start_pre_rw_mount(fs_info); + if (ret) { + close_ctree(fs_info); + return ret; + } + btrfs_discard_resume(fs_info); + + if (fs_info->uuid_root && + (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || + fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { + btrfs_info(fs_info, "checking UUID tree"); + ret = btrfs_check_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to check the UUID tree: %d", ret); + close_ctree(fs_info); + return ret; + } + } + + set_bit(BTRFS_FS_OPEN, &fs_info->flags); + + /* Kick the cleaner thread so it'll start deleting snapshots. 
*/ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + +clear_oneshot: + btrfs_clear_oneshot_options(fs_info); + return 0; + +fail_qgroup: + btrfs_free_qgroup_config(fs_info); +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); + btrfs_cleanup_transaction(fs_info); + btrfs_free_fs_roots(fs_info); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + +fail_sysfs: + btrfs_sysfs_remove_mounted(fs_info); + +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + +fail_block_groups: + btrfs_put_block_group_cache(fs_info); + +fail_tree_roots: + if (fs_info->data_reloc_root) + btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root); + free_root_pointers(fs_info, true); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_sb_buffer: + btrfs_stop_all_workers(fs_info); + btrfs_free_block_groups(fs_info); +fail_alloc: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + iput(fs_info->btree_inode); +fail: + btrfs_close_devices(fs_info->fs_devices); + ASSERT(ret < 0); + return ret; +} +ALLOW_ERROR_INJECTION(open_ctree, ERRNO); + +static void btrfs_end_super_write(struct bio *bio) +{ + struct btrfs_device *device = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + struct page *page; + + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + + if (bio->bi_status) { + btrfs_warn_rl_in_rcu(device->fs_info, + "lost page write due to IO error on %s (%d)", + btrfs_dev_name(device), + blk_status_to_errno(bio->bi_status)); + ClearPageUptodate(page); + SetPageError(page); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_WRITE_ERRS); + } else { + SetPageUptodate(page); + } + + put_page(page); + unlock_page(page); + } + + bio_put(bio); +} + +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num, bool drop_cache) +{ + struct btrfs_super_block *super; + struct page *page; + u64 bytenr, bytenr_orig; + struct address_space *mapping = bdev->bd_inode->i_mapping; + int ret; + + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret == -ENOENT) + return ERR_PTR(-EINVAL); + else if (ret) + return ERR_PTR(ret); + + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) + return ERR_PTR(-EINVAL); + + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); + + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, + bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) + return ERR_CAST(page); + + super = page_address(page); + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr_orig) { + btrfs_release_disk_super(super); + return ERR_PTR(-EINVAL); + } + + return super; +} + + +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) +{ + struct btrfs_super_block *super, *latest = NULL; + int i; + u64 transid = 0; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. 
+ * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + super = btrfs_read_dev_one_super(bdev, i, false); + if (IS_ERR(super)) + continue; + + if (!latest || btrfs_super_generation(super) > transid) { + if (latest) + btrfs_release_disk_super(super); + + latest = super; + transid = btrfs_super_generation(super); + } + } + + return super; +} + +/* + * Write superblock @sb to the @device. Do not wait for completion, all the + * pages we use for writing are locked. + * + * Write @max_mirrors copies of the superblock, where 0 means default that fit + * the expected device size at commit time. Note that max_mirrors must be + * same for write and wait phases. + * + * Return number of errors when page is not found or submission fails. + */ +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, int max_mirrors) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct address_space *mapping = device->bdev->bd_inode->i_mapping; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + int i; + int errors = 0; + int ret; + u64 bytenr, bytenr_orig; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + shash->tfm = fs_info->csum_shash; + + for (i = 0; i < max_mirrors; i++) { + struct page *page; + struct bio *bio; + struct btrfs_super_block *disk_super; + + bytenr_orig = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + btrfs_err(device->fs_info, + "couldn't get super block location for mirror %d", + i); + errors++; + continue; + } + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) + break; + + btrfs_set_super_bytenr(sb, bytenr_orig); + + crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, + sb->csum); + + page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, + GFP_NOFS); + if (!page) { + btrfs_err(device->fs_info, + "couldn't get super block page for bytenr %llu", + bytenr); + errors++; + continue; + } + + /* Bump the refcount for wait_dev_supers() */ + get_page(page); + + disk_super = page_address(page); + memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); + + /* + * Directly use bios here instead of relying on the page cache + * to do I/O, so we don't lose the ability to do integrity + * checking. + */ + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); + bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; + bio->bi_private = device; + bio->bi_end_io = btrfs_end_super_write; + __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, + offset_in_page(bytenr)); + + /* + * We FUA only the first super block. The others we allow to + * go down lazy and there's a short window where the on-disk + * copies might still contain the older version. + */ + if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) + bio->bi_opf |= REQ_FUA; + + btrfsic_check_bio(bio); + submit_bio(bio); + + if (btrfs_advance_sb_log(device, i)) + errors++; + } + return errors < i ? 0 : -1; +} + +/* + * Wait for write completion of superblocks done by write_dev_supers, + * @max_mirrors same for write and wait phases. + * + * Return number of errors when page is not found or not marked up to + * date. 
+ */ +static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) +{ + int i; + int errors = 0; + bool primary_failed = false; + int ret; + u64 bytenr; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + for (i = 0; i < max_mirrors; i++) { + struct page *page; + + ret = btrfs_sb_log_location(device, i, READ, &bytenr); + if (ret == -ENOENT) { + break; + } else if (ret < 0) { + errors++; + if (i == 0) + primary_failed = true; + continue; + } + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) + break; + + page = find_get_page(device->bdev->bd_inode->i_mapping, + bytenr >> PAGE_SHIFT); + if (!page) { + errors++; + if (i == 0) + primary_failed = true; + continue; + } + /* Page is submitted locked and unlocked once the IO completes */ + wait_on_page_locked(page); + if (PageError(page)) { + errors++; + if (i == 0) + primary_failed = true; + } + + /* Drop our reference */ + put_page(page); + + /* Drop the reference from the writing run */ + put_page(page); + } + + /* log error, force error return */ + if (primary_failed) { + btrfs_err(device->fs_info, "error writing primary super block to device %llu", + device->devid); + return -1; + } + + return errors < i ? 0 : -1; +} + +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio) +{ + bio_uninit(bio); + complete(bio->bi_private); +} + +/* + * Submit a flush request to the device if it supports it. Error handling is + * done in the waiting counterpart. + */ +static void write_dev_flush(struct btrfs_device *device) +{ + struct bio *bio = &device->flush_bio; + + device->last_flush_error = BLK_STS_OK; + +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However when the integrity checker is enabled, this + * results in reports that there are metadata blocks referred by a + * superblock that were not properly flushed. So don't skip the bio + * submission only when the integrity checker is enabled for the sake + * of simplicity, since this is a debug tool and not meant for use in + * non-debug builds. + */ + if (!bdev_write_cache(device->bdev)) + return; +#endif + + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); + bio->bi_end_io = btrfs_end_empty_barrier; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + + btrfsic_check_bio(bio); + submit_bio(bio); + set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); +} + +/* + * If the flush bio has been submitted by write_dev_flush, wait for it. + * Return true for any error, and false otherwise. 
+ */ +static bool wait_dev_flush(struct btrfs_device *device) +{ + struct bio *bio = &device->flush_bio; + + if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) + return false; + + wait_for_completion_io(&device->flush_wait); + + if (bio->bi_status) { + device->last_flush_error = bio->bi_status; + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); + return true; + } + + return false; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors_wait = 0; + + lockdep_assert_held(&info->fs_devices->device_list_mutex); + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!dev->bdev) + continue; + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) + continue; + + write_dev_flush(dev); + } + + /* wait for all the barriers */ + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!dev->bdev) { + errors_wait++; + continue; + } + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) + continue; + + if (wait_dev_flush(dev)) + errors_wait++; + } + + /* + * Checks last_flush_error of disks in order to determine the device + * state. + */ + if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + return -EIO; + + return 0; +} + +int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) +{ + int raid_type; + int min_tolerated = INT_MAX; + + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || + (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) + min_tolerated = min_t(int, min_tolerated, + btrfs_raid_array[BTRFS_RAID_SINGLE]. + tolerated_failures); + + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (raid_type == BTRFS_RAID_SINGLE) + continue; + if (!(flags & btrfs_raid_array[raid_type].bg_flag)) + continue; + min_tolerated = min_t(int, min_tolerated, + btrfs_raid_array[raid_type]. + tolerated_failures); + } + + if (min_tolerated == INT_MAX) { + pr_warn("BTRFS: unknown raid flag: %llu", flags); + min_tolerated = 0; + } + + return min_tolerated; +} + +int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) +{ + struct list_head *head; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); + + /* + * max_mirrors == 0 indicates we're from commit_transaction, + * not from fsync where the tree roots in fs_info have not + * been consistent on disk. 
+ */ + if (max_mirrors == 0) + backup_super_roots(fs_info); + + sb = fs_info->super_for_commit; + dev_item = &sb->dev_item; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + head = &fs_info->fs_devices->devices; + max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1; + + if (do_barriers) { + ret = barrier_all_devices(fs_info); + if (ret) { + mutex_unlock( + &fs_info->fs_devices->device_list_mutex); + btrfs_handle_fs_error(fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } + + list_for_each_entry(dev, head, dev_list) { + if (!dev->bdev) { + total_errors++; + continue; + } + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, + dev->commit_total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, + dev->commit_bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = btrfs_validate_write_super(fs_info, sb); + if (ret < 0) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_handle_fs_error(fs_info, -EUCLEAN, + "unexpected superblock corruption detected"); + return -EUCLEAN; + } + + ret = write_dev_supers(dev, sb, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + btrfs_err(fs_info, "%d errors while writing supers", + total_errors); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + /* FUA is masked off if unsupported and can't be the reason */ + btrfs_handle_fs_error(fs_info, -EIO, + "%d errors while writing supers", + total_errors); + return -EIO; + } + + total_errors = 0; + list_for_each_entry(dev, head, dev_list) { + if (!dev->bdev) + continue; + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) + continue; + + ret = wait_dev_supers(dev, max_mirrors); + if (ret) + total_errors++; + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (total_errors > max_errors) { + btrfs_handle_fs_error(fs_info, -EIO, + "%d errors while writing supers", + total_errors); + return -EIO; + } + return 0; +} + +/* Drop a fs root from the radix tree and free it. 
*/ +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + bool drop_ref = false; + + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + drop_ref = true; + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (BTRFS_FS_ERROR(fs_info)) { + ASSERT(root->log_root == NULL); + if (root->reloc_root) { + btrfs_put_root(root->reloc_root); + root->reloc_root = NULL; + } + } + + if (drop_ref) + btrfs_put_root(root); +} + +int btrfs_commit_super(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + + mutex_lock(&fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_mutex); + wake_up_process(fs_info->cleaner_kthread); + + /* wait until ongoing cleanup work done */ + down_write(&fs_info->cleanup_work_sem); + up_write(&fs_info->cleanup_work_sem); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans); +} + +static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_transaction *trans; + struct btrfs_transaction *tmp; + bool found = false; + + if (list_empty(&fs_info->trans_list)) + return; + + /* + * This function is only called at the very end of close_ctree(), + * thus no other running transaction, no need to take trans_lock. + */ + ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)); + list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) { + struct extent_state *cached = NULL; + u64 dirty_bytes = 0; + u64 cur = 0; + u64 found_start; + u64 found_end; + + found = true; + while (find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, EXTENT_DIRTY, &cached)) { + dirty_bytes += found_end + 1 - found_start; + cur = found_end + 1; + } + btrfs_warn(fs_info, + "transaction %llu (with %llu dirty metadata bytes) is not committed", + trans->transid, dirty_bytes); + btrfs_cleanup_one_transaction(trans, fs_info); + + if (trans == fs_info->running_transaction) + fs_info->running_transaction = NULL; + list_del_init(&trans->list); + + btrfs_put_transaction(trans); + trace_btrfs_transaction_commit(fs_info); + } + ASSERT(!found); +} + +void __cold close_ctree(struct btrfs_fs_info *fs_info) +{ + int ret; + + set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + * We must do this before stopping the block group reclaim task, because + * at btrfs_relocate_block_group() we wait for this bit, and after the + * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we + * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will + * return 1. + */ + btrfs_wake_unfinished_drop(fs_info); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. 
+ */ + cancel_work_sync(&fs_info->reclaim_bgs_work); + /* + * We don't want the cleaner to start new transactions, add more delayed + * iputs, etc. while we're closing. We can't use kthread_stop() yet + * because that frees the task_struct, and the transaction kthread might + * still try to wake up the cleaner. + */ + kthread_park(fs_info->cleaner_kthread); + + /* wait for the qgroup rescan worker to stop */ + btrfs_qgroup_wait_for_completion(fs_info, false); + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al., set sem back to initial state */ + up(&fs_info->uuid_tree_rescan_sem); + + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); + + btrfs_scrub_cancel(fs_info); + + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + + /* clear out the rbtree of defraggable inodes */ + btrfs_cleanup_defrag_inodes(fs_info); + + /* + * After we parked the cleaner kthread, ordered extents may have + * completed and created new delayed iputs. If one of the async reclaim + * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we + * can hang forever trying to stop it, because if a delayed iput is + * added after it ran btrfs_run_delayed_iputs() and before it called + * btrfs_wait_on_delayed_iputs(), it will hang forever since there is + * no one else to run iputs. + * + * So wait for all ongoing ordered extents to complete and then run + * delayed iputs. This works because once we reach this point no one + * can either create new ordered extents nor create delayed iputs + * through some other means. + * + * Also note that btrfs_wait_ordered_roots() is not safe here, because + * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, + * but the delayed iput for the respective inode is made only when doing + * the final btrfs_put_ordered_extent() (which must happen at + * btrfs_finish_ordered_io() when we are unmounting). + */ + btrfs_flush_workqueue(fs_info->endio_write_workers); + /* Ordered extents for free space inodes. */ + btrfs_flush_workqueue(fs_info->endio_freespace_worker); + btrfs_run_delayed_iputs(fs_info); + + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); + + /* Cancel or finish ongoing discard work */ + btrfs_discard_cleanup(fs_info); + + if (!sb_rdonly(fs_info->sb)) { + /* + * The cleaner kthread is stopped, so do one final pass over + * unused block groups. + */ + btrfs_delete_unused_bgs(fs_info); + + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. + * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case. 
+ */ + btrfs_flush_workqueue(fs_info->delayed_workers); + + ret = btrfs_commit_super(fs_info); + if (ret) + btrfs_err(fs_info, "commit super ret %d", ret); + } + + if (BTRFS_FS_ERROR(fs_info)) + btrfs_error_commit_super(fs_info); + + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); + + ASSERT(list_empty(&fs_info->delayed_iputs)); + set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); + + if (btrfs_check_quota_leak(fs_info)) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err(fs_info, "qgroup reserved space leaked"); + } + + btrfs_free_qgroup_config(fs_info); + ASSERT(list_empty(&fs_info->delalloc_roots)); + + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + btrfs_info(fs_info, "at unmount delalloc count %lld", + percpu_counter_sum(&fs_info->delalloc_bytes)); + } + + if (percpu_counter_sum(&fs_info->ordered_bytes)) + btrfs_info(fs_info, "at unmount dio bytes count %lld", + percpu_counter_sum(&fs_info->ordered_bytes)); + + btrfs_sysfs_remove_mounted(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + + btrfs_put_block_group_cache(fs_info); + + /* + * We must make sure there are no read requests to + * submit after we stop all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + btrfs_stop_all_workers(fs_info); + + /* We shouldn't have any transaction open at this point */ + warn_about_uncommitted_trans(fs_info); + + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); + free_root_pointers(fs_info, true); + btrfs_free_fs_roots(fs_info); + + /* + * We must free the block groups after dropping the fs_roots as we could + * have had an IO error and have left over tree log blocks that aren't + * cleaned up until the fs roots are freed. This makes the block group + * accounting appear to be wrong because there are pending reserved bytes, + * so make sure we do the block group cleanup afterwards. + */ + btrfs_free_block_groups(fs_info); + + iput(fs_info->btree_inode); + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) + btrfsic_unmount(fs_info->fs_devices); +#endif + + btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_close_devices(fs_info->fs_devices); +} + +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf) +{ + struct btrfs_fs_info *fs_info = buf->fs_info; + u64 transid = btrfs_header_generation(buf); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* + * This is a fast path so only do this check if we have sanity tests + * enabled. Normal people shouldn't be using unmapped buffers as dirty + * outside of the sanity tests. + */ + if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) + return; +#endif + /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */ + ASSERT(trans->transid == fs_info->generation); + btrfs_assert_tree_write_locked(buf); + if (transid != fs_info->generation) { + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", + buf->start, transid, fs_info->generation); + btrfs_abort_transaction(trans, -EUCLEAN); + } + set_extent_buffer_dirty(buf); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * btrfs_check_leaf() won't check item data if we don't have WRITTEN + * set, so this will only validate the basic structure of the items.
+ */ + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { + btrfs_print_leaf(buf); + ASSERT(0); + } +#endif +} + +static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, + int flush_delayed) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + int ret; + + if (current->flags & PF_MEMALLOC) + return; + + if (flush_delayed) + btrfs_balance_delayed_items(fs_info); + + ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH, + fs_info->dirty_metadata_batch); + if (ret > 0) { + balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping); + } +} + +void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info) +{ + __btrfs_btree_balance_dirty(fs_info, 1); +} + +void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) +{ + __btrfs_btree_balance_dirty(fs_info, 0); +} + +static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) +{ + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(fs_info); + + mutex_lock(&fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_mutex); + + down_write(&fs_info->cleanup_work_sem); + up_write(&fs_info->cleanup_work_sem); +} + +static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; + + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); + } + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + btrfs_free_log_root_tree(NULL, fs_info); +} + +static void btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&root->ordered_extent_lock); + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + spin_unlock(&root->ordered_extent_lock); +} + +static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + LIST_HEAD(splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + + spin_unlock(&fs_info->ordered_root_lock); + btrfs_destroy_ordered_extents(root); + + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. 
+ */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); +} + +static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (atomic_read(&delayed_refs->num_entries) == 0) { + spin_unlock(&delayed_refs->lock); + btrfs_debug(fs_info, "delayed_refs has NO entry"); + return; + } + + while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; + bool pin_bytes = false; + + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_lock(delayed_refs, head)) + continue; + + spin_lock(&head->lock); + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { + ref = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); + rb_erase_cached(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + btrfs_delete_ref_head(delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (pin_bytes) { + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + BUG_ON(!cache); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + cache->space_info, head->num_bytes); + cache->reserved -= head->num_bytes; + cache->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + btrfs_put_delayed_ref_head(head); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + btrfs_qgroup_destroy_extent_records(trans); + + spin_unlock(&delayed_refs->lock); +} + +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + LIST_HEAD(splice); + + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); + + while (!list_empty(&splice)) { + struct inode *inode = NULL; + btrfs_inode = list_first_entry(&splice, struct btrfs_inode, + delalloc_inodes); + __btrfs_del_delalloc_inode(root, btrfs_inode); + spin_unlock(&root->delalloc_lock); + + /* + * Make sure we get a live inode and that it'll not disappear + * meanwhile. 
+ */ + inode = igrab(&btrfs_inode->vfs_inode); + if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); + iput(inode); + } + spin_lock(&root->delalloc_lock); + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + LIST_HEAD(splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_root(root); + BUG_ON(!root); + spin_unlock(&fs_info->delalloc_root_lock); + + btrfs_destroy_delalloc_inodes(root); + btrfs_put_root(root); + + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); +} + +static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages, + int mark) +{ + struct extent_buffer *eb; + u64 start = 0; + u64 end; + + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { + clear_extent_bits(dirty_pages, start, end, mark); + while (start <= end) { + eb = find_extent_buffer(fs_info, start); + start += fs_info->nodesize; + if (!eb) + continue; + + btrfs_tree_lock(eb); + wait_on_extent_buffer_writeback(eb); + btrfs_clear_buffer_dirty(NULL, eb); + btrfs_tree_unlock(eb); + + free_extent_buffer_stale(eb); + } + } +} + +static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, + struct extent_io_tree *unpin) +{ + u64 start; + u64 end; + + while (1) { + struct extent_state *cached_state = NULL; + + /* + * The btrfs_finish_extent_commit() may get the same range as + * ours between find_first_extent_bit and clear_extent_dirty. + * Hence, hold the unused_bg_unpin_mutex to avoid double unpin + * the same extent range. 
+ */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + break; + } + + clear_extent_dirty(unpin, start, end, &cached_state); + free_extent_state(cached_state); + btrfs_error_unpin_extent_range(fs_info, start, end); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + } +} + +static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) +{ + struct inode *inode; + + inode = cache->io_ctl.inode; + if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); + + BTRFS_I(inode)->generation = 0; + cache->io_ctl.inode = NULL; + iput(inode); + } + ASSERT(cache->io_ctl.pages == NULL); + btrfs_put_block_group(cache); +} + +void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *cache; + + spin_lock(&cur_trans->dirty_bgs_lock); + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group, + dirty_list); + + if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->io_list); + btrfs_cleanup_bg_io(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + + list_del_init(&cache->dirty_list); + spin_lock(&cache->lock); + cache->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&cache->lock); + + spin_unlock(&cur_trans->dirty_bgs_lock); + btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); + spin_lock(&cur_trans->dirty_bgs_lock); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ + while (!list_empty(&cur_trans->io_bgs)) { + cache = list_first_entry(&cur_trans->io_bgs, + struct btrfs_block_group, + io_list); + + list_del_init(&cache->io_list); + spin_lock(&cache->lock); + cache->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&cache->lock); + btrfs_cleanup_bg_io(cache); + } +} + +static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *gang[8]; + int i; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + + btrfs_qgroup_free_meta_all_pertrans(root); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); +} + +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *dev, *tmp; + + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); + ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); + + list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list, + post_commit_list) { + list_del_init(&dev->post_commit_list); + } + + btrfs_destroy_delayed_refs(cur_trans, fs_info); + + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + + cur_trans->state = TRANS_STATE_UNBLOCKED; + wake_up(&fs_info->transaction_wait); + + btrfs_destroy_delayed_inodes(fs_info); + + btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, + 
EXTENT_DIRTY); + btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + + btrfs_free_all_qgroup_pertrans(fs_info); + + cur_trans->state =TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); +} + +static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) +{ + struct btrfs_transaction *t; + + mutex_lock(&fs_info->transaction_kthread_mutex); + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&fs_info->trans_list)) { + t = list_first_entry(&fs_info->trans_list, + struct btrfs_transaction, list); + if (t->state >= TRANS_STATE_COMMIT_PREP) { + refcount_inc(&t->use_count); + spin_unlock(&fs_info->trans_lock); + btrfs_wait_for_commit(fs_info, t->transid); + btrfs_put_transaction(t); + spin_lock(&fs_info->trans_lock); + continue; + } + if (t == fs_info->running_transaction) { + t->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&fs_info->trans_lock); + /* + * We wait for 0 num_writers since we don't hold a trans + * handle open currently for this transaction. + */ + wait_event(t->writer_wait, + atomic_read(&t->num_writers) == 0); + } else { + spin_unlock(&fs_info->trans_lock); + } + btrfs_cleanup_one_transaction(t, fs_info); + + spin_lock(&fs_info->trans_lock); + if (t == fs_info->running_transaction) + fs_info->running_transaction = NULL; + list_del_init(&t->list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_transaction(t); + trace_btrfs_transaction_commit(fs_info); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); + btrfs_destroy_all_ordered_extents(fs_info); + btrfs_destroy_delayed_inodes(fs_info); + btrfs_assert_delayed_root_empty(fs_info); + btrfs_destroy_all_delalloc_inodes(fs_info); + btrfs_drop_all_logs(fs_info); + mutex_unlock(&fs_info->transaction_kthread_mutex); + + return 0; +} + +int btrfs_init_root_free_objectid(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); /* Corruption */ + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + root->free_objectid = max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID); + } else { + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + btrfs_warn(root->fs_info, + "the objectid of root %llu reaches its highest value", + root->root_key.objectid); + ret = -ENOSPC; + goto out; + } + + *objectid = root->free_objectid++; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 0000000000..50dab8f639 --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ */ + +#ifndef BTRFS_DISK_IO_H +#define BTRFS_DISK_IO_H + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +/* + * Fixed blocksize for all devices, applies to specific ways of reading + * metadata like superblock. Must meet the set_blocksize requirements. + * + * Do not change. + */ +#define BTRFS_BDEV_BLOCKSIZE (4096) + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = SZ_16K; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_tree_parent_check; + +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); +struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, + struct btrfs_tree_parent_check *check); +struct extent_buffer *btrfs_find_create_tree_block( + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, + int level); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb); +int __cold open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +void __cold close_ctree(struct btrfs_fs_info *fs_info); +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num); +int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); +int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num, bool drop_cache); +int btrfs_commit_super(struct btrfs_fs_info *fs_info); +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key); +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, bool check_ref); +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev); +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid); +int btrfs_global_root_insert(struct btrfs_root *root); +void btrfs_global_root_delete(struct btrfs_root *root); +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key); +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); + +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); +void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); +#endif + +/* + * This function is used to grab the root, 
and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info->subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) +{ + if (!root) + return NULL; + if (refcount_inc_not_zero(&root->refs)) + return root; + return NULL; +} + +void btrfs_put_root(struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic); +int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); + +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + u64 objectid); +int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_init_root_free_objectid(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 0000000000..744a02b7fd --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "accessors.h" +#include "super.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + int len = *max_len; + int type; + + if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; + return FILEID_INVALID; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return FILEID_INVALID; + } + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = btrfs_ino(BTRFS_I(inode)); + fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; + fid->gen = inode->i_generation; + + if (parent) { + u64 parent_root_id; + + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->root_key.objectid; + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +/* + * Read dentry of inode with @objectid from filesystem root @root_objectid. 
+ * + * @sb: the filesystem super block + * @objectid: inode objectid + * @root_objectid: object id of the subvolume root where to look up the inode + * @generation: optional, if not zero, verify that the found inode + * generation matches + * + * Return dentry alias for the inode, otherwise an error. In case the + * generation does not match return ESTALE. + */ +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u64 generation) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root; + struct inode *inode; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); + + root = btrfs_get_fs_root(fs_info, root_objectid, true); + if (IS_ERR(root)) + return ERR_CAST(root); + + inode = btrfs_iget(sb, objectid, root); + btrfs_put_root(root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation != 0 && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len < BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len < BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = d_inode(child); + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = fs_info->tree_root; + } else { + key.objectid = btrfs_ino(BTRFS_I(dir)); + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); /* Key with offset of -1 found */ + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; + } + + path->slots[0]--; + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; + } + + if (found_key.type == 
BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } + btrfs_free_path(path); + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(fs_info->sb, key.objectid, + found_key.offset, 0); + } + + return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); +fail: + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + u64 ino; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + ino = btrfs_ino(BTRFS_I(inode)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = fs_info->tree_root; + } else { + key.objectid = ino; + key.offset = btrfs_ino(BTRFS_I(dir)); + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 0000000000..eba6bc4f5a --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u64 generation); +struct dentry *btrfs_get_parent(struct dentry *child); + +#endif diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c new file mode 100644 index 0000000000..ff8e117a1a --- /dev/null +++ b/fs/btrfs/extent-io-tree.c @@ -0,0 +1,1779 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "messages.h" +#include "ctree.h" +#include "extent-io-tree.h" +#include 
"btrfs_inode.h" +#include "misc.h" + +static struct kmem_cache *extent_state_cache; + +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + +#ifdef CONFIG_BTRFS_DEBUG +static LIST_HEAD(states); +static DEFINE_SPINLOCK(leak_lock); + +static inline void btrfs_leak_debug_add_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_leak_debug_del_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_extent_state_leak_debug_check(void) +{ + struct extent_state *state; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), + refcount_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct btrfs_inode *inode = tree->inode; + u64 isize; + + if (!inode) + return; + + isize = i_size_read(&inode->vfs_inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(inode->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(inode), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add_state(state) do {} while (0) +#define btrfs_leak_debug_del_state(state) do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#endif + +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner) +{ + tree->fs_info = fs_info; + tree->state = RB_ROOT; + spin_lock_init(&tree->lock); + tree->inode = NULL; + tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once extent_io_tree_release is + * called. 
+ */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + + /* + * The given mask might be not appropriate for the slab allocator, + * drop the unsupported bits + */ + mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + RB_CLEAR_NODE(&state->rb_node); + btrfs_leak_debug_add_state(state); + refcount_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); + return state; +} + +static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (refcount_dec_and_test(&state->refs)) { + WARN_ON(extent_state_in_tree(state)); + btrfs_leak_debug_del_state(state); + trace_free_extent_state(state, _RET_IP_); + kmem_cache_free(extent_state_cache, state); + } +} + +static int add_extent_changeset(struct extent_state *state, u32 bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return 0; + if (set && (state->state & bits) == bits) + return 0; + if (!set && (state->state & bits) == 0) + return 0; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(&changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + return ret; +} + +static inline struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +static inline struct extent_state *prev_state(struct extent_state *state) +{ + struct rb_node *next = rb_prev(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +/* + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @node_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, + * containing @offset + * + * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. 
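A minimal lifecycle sketch for an extent io tree, assuming the surrounding btrfs context (IO_TREE_SELFTEST, EXTENT_DIRTY and set_extent_bit() are declared later in this patch, SZ_64K comes from linux/sizes.h); the function name is made up for illustration.

/* Sketch: init, mark an inclusive range, then tear the tree down. */
static int example_io_tree_lifecycle(struct btrfs_fs_info *fs_info)
{
        struct extent_io_tree tree;
        int ret;

        /* Empty rb-tree, spinlock initialized, owner recorded. */
        extent_io_tree_init(fs_info, &tree, IO_TREE_SELFTEST);

        /* Ranges are inclusive; allocations default to GFP_NOFS. */
        ret = set_extent_bit(&tree, 0, SZ_64K - 1, EXTENT_DIRTY, NULL);

        /* Frees every extent_state still in the tree. */
        extent_io_tree_release(&tree);
        return ret;
}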
+ */ +static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct extent_state *entry = NULL; + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + if (node_ret) + *node_ret = node; + if (parent_ret) + *parent_ret = prev; + + /* Search neighbors until we find the first one past the end */ + while (entry && offset > entry->end) + entry = next_state(entry); + + return entry; +} + +/* + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. + */ +static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct extent_state **prev_ret, + struct extent_state **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct extent_state *orig_prev; + struct extent_state *entry = NULL; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + entry = rb_entry(*node, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + orig_prev = entry; + while (entry && offset > entry->end) + entry = next_state(entry); + *next_ret = entry; + entry = orig_prev; + + while (entry && offset < entry->start) + entry = prev_state(entry); + *prev_ret = entry; + + return NULL; +} + +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree->fs_info, err, + "locking error: extent tree was modified by another thread while locked"); +} + +/* + * Utility function to look for merge candidates inside a given range. Any + * extents with matching state are merged together into a single extent in the + * tree. Extents with EXTENT_IO in their state field are not merged because + * the end_io handlers need to be able to do operations on them without + * sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. 
+ */ +static void merge_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *other; + + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + return; + + other = prev_state(state); + if (other && other->end == state->start - 1 && + other->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, other); + state->start = other->start; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } + other = next_state(state); + if (other && other->start == state->end + 1 && + other->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, other); + state->end = other->end; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + u32 bits_to_set = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->inode) + btrfs_set_delalloc_extent(tree->inode, state, bits); + + ret = add_extent_changeset(state, bits_to_set, changeset, 1); + BUG_ON(ret < 0); + state->state |= bits_to_set; +} + +/* + * Insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + struct rb_node **node; + struct rb_node *parent = NULL; + const u64 end = state->end; + + set_state_bits(tree, state, bits, changeset); + + node = &tree->state.rb_node; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } + } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + + merge_state(tree, state); + return 0; +} + +/* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* + * Split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. 
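To make the merging behaviour of merge_state() above concrete, here is a hedged sketch (helper name hypothetical; set_extent_bit() and find_first_extent_bit() are declared later in this patch): two back-to-back ranges with identical, non-boundary bits end up backed by a single extent_state.

/* Illustration only: adjacent ranges with identical bits get coalesced. */
static bool example_ranges_merged(struct extent_io_tree *tree)
{
        u64 start, end;

        set_extent_bit(tree, 0, SZ_4K - 1, EXTENT_DIRTY, NULL);
        set_extent_bit(tree, SZ_4K, SZ_8K - 1, EXTENT_DIRTY, NULL);

        /* Both calls are now represented by one state [0, SZ_8K - 1]. */
        return find_first_extent_bit(tree, 0, &start, &end, EXTENT_DIRTY, NULL) &&
               start == 0 && end == SZ_8K - 1;
}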
+ */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *parent = NULL; + struct rb_node **node; + + if (tree->inode) + btrfs_split_delalloc_extent(tree->inode, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } + } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + + return 0; +} + +/* + * Utility function to clear some bits in an extent state struct. It will + * optionally wake up anyone waiting on this state (wake == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, int wake, + struct extent_changeset *changeset) +{ + struct extent_state *next; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->inode) + btrfs_clear_delalloc_extent(tree->inode, state, bits); + + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); + BUG_ON(ret < 0); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + next = next_state(state); + if (extent_state_in_tree(state)) { + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + next = next_state(state); + } + return next; +} + +/* + * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly, + * unset the EXTENT_NOWAIT bit. + */ +static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) +{ + *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS); + *bits &= EXTENT_NOWAIT - 1; +} + +/* + * Clear some bits on a range in the tree. This may require splitting or + * inserting elements in the tree, so the gfp mask is used to indicate which + * allocations or sleeping are allowed. + * + * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given + * range from the tree regardless of state (ie for truncate). + * + * The range [start, end] is inclusive. + * + * This takes the tree lock, and returns 0 on success and < 0 on error. + */ +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + struct extent_changeset *changeset) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + u64 last_end; + int err; + int clear = 0; + int wake; + int delete = (bits & EXTENT_CLEAR_ALL_BITS); + gfp_t mask; + + set_gfp_mask_from_bits(&bits, &mask); + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); + + if (delete) + bits |= ~EXTENT_CTLBITS; + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + + wake = (bits & EXTENT_LOCKED) ? 
1 : 0; + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { + if (clear) + refcount_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + + /* This search will find the extents that end after our range starts. */ + state = tree_search(tree, start); + if (!state) + goto out; +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* The state doesn't have the wanted bits, go ahead. */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we clear the desired bit + * on it. + */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + state = clear_state_bit(tree, state, bits, wake, changeset); + goto next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + if (wake) + wake_up(&state->wq); + + clear_state_bit(tree, prealloc, bits, wake, changeset); + + prealloc = NULL; + goto out; + } + + state = clear_state_bit(tree, state, bits, wake, changeset); +next: + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && state && !need_resched()) + goto hit_next; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + +} + +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); +} + +/* + * Wait for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. 
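Since the clearing and waiting helpers above all take an inclusive [start, end], a caller working from a byte length has to subtract one; a minimal sketch, with a hypothetical helper name and clear_extent_bit() being the inline wrapper declared later in this patch:

/* Sketch: a byte length maps to the inclusive range [start, start + len - 1]. */
static int example_clear_range(struct extent_io_tree *tree, u64 start, u64 len)
{
        if (len == 0)
                return 0;
        return clear_extent_bit(tree, start, start + len - 1,
                                EXTENT_DIRTY | EXTENT_DELALLOC, NULL);
}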
+ * The tree lock is taken by this function + */ +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + + btrfs_debug_check_extent_io_range(tree, start, end); + + spin_lock(&tree->lock); +again: + /* + * Maintain cached_state, as we may not remove it from the tree if there + * are more bits than the bits we're waiting on set on this state. + */ + if (cached_state && *cached_state) { + state = *cached_state; + if (extent_state_in_tree(state) && + state->start <= start && start < state->end) + goto process_node; + } + while (1) { + /* + * This search will find all the extents that end after our + * range starts. + */ + state = tree_search(tree, start); +process_node: + if (!state) + break; + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + refcount_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (!cond_resched_lock(&tree->lock)) { + state = next_state(state); + goto process_node; + } + } +out: + /* This state is no longer useful, clear it and free it up. */ + if (cached_state && *cached_state) { + state = *cached_state; + *cached_state = NULL; + free_extent_state(state); + } + spin_unlock(&tree->lock); +} + +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + unsigned flags) +{ + if (cached_ptr && !(*cached_ptr)) { + if (!flags || (state->state & flags)) { + *cached_ptr = state; + refcount_inc(&state->refs); + } + } +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_LOCKED | EXTENT_BOUNDARY); +} + +/* + * Find the first state struct with 'bits' set after 'start', and return it. + * tree->lock must be held. NULL will returned if nothing was found after + * 'start'. + */ +static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, u32 bits) +{ + struct extent_state *state; + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, start); + while (state) { + if (state->end >= start && (state->state & bits)) + return state; + state = next_state(state); + } + return NULL; +} + +/* + * Find the first offset in the io tree with one or more @bits set. + * + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return true if we find something, and update @start_ret and @end_ret. + * Return false if we found nothing. 
+ */ +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + bool ret = false; + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && extent_state_in_tree(state)) { + while ((state = next_state(state)) != NULL) { + if (state->state & bits) + goto got_it; + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + + state = find_first_extent_bit_state(tree, start, bits); +got_it: + if (state) { + cache_state_if_flags(state, cached_state, 0); + *start_ret = state->start; + *end_ret = state->end; + ret = true; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous range of bytes in the file marked as delalloc, not more + * than 'max_bytes'. start and end are used to return the range, + * + * True is returned if we find something, false if nothing was in the tree. + */ +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct extent_state *state; + u64 cur_start = *start; + bool found = false; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, cur_start); + if (!state) { + *end = (u64)-1; + goto out; + } + + while (state) { + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + refcount_inc(&state->refs); + } + found = true; + *end = state->end; + cur_start = state->end + 1; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + state = next_state(state); + } +out: + spin_unlock(&tree->lock); + return found; +} + +/* + * Set some bits on a range in the tree. This may require allocations or + * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for + * GFP_NOWAIT. 
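A common way to consume find_first_extent_bit() above is to walk every range carrying a given bit, restarting the search just past the previous hit; this sketch (name and use case illustrative only) totals the bytes currently marked dirty.

/* Sketch: visit every range that has at least one of the given bits set. */
static u64 example_count_dirty_bytes(struct extent_io_tree *tree)
{
        u64 start = 0, found_start, found_end;
        u64 bytes = 0;

        while (find_first_extent_bit(tree, start, &found_start, &found_end,
                                     EXTENT_DIRTY, NULL)) {
                bytes += found_end - found_start + 1;
                if (found_end == (u64)-1)
                        break;
                start = found_end + 1;
        }
        return bytes;
}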
+ * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The extent_state of the + * existing range is returned in failed_state in this case, and the start of the + * existing range is returned in failed_start. failed_state is used as an + * optimization for wait_extent_bit, failed_start must be used as the source of + * truth as failed_state may have changed since we returned. + * + * [start, end] is inclusive This takes the tree lock. + */ +static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **failed_state, + struct extent_state **cached_state, + struct extent_changeset *changeset) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; + int err = 0; + u64 last_start; + u64 last_end; + u32 exclusive_bits = (bits & EXTENT_LOCKED); + gfp_t mask; + + set_gfp_mask_from_bits(&bits, &mask); + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL && failed_state == NULL); +again: + if (!prealloc) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + if (state->state & exclusive_bits) { + *failed_start = state->start; + cache_state(state, failed_state); + err = -EEXIST; + goto out; + } + + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. 
+ */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + cache_state(state, failed_state); + err = -EEXIST; + goto out; + } + + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. + */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + + /* + * Avoid to free 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + cache_state(state, failed_state); + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, changeset); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +} + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) +{ + return __set_extent_bit(tree, start, end, bits, NULL, NULL, + cached_state, NULL); +} + +/* + * Convert all bits in a given range from one bit to another + * + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie. + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. 
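Following the convert_extent_bit() kernel-doc above ("converting from say DELALLOC to DIRTY"), a hedged usage sketch; the wrapper name is hypothetical, and the cached state handed back carries a reference that the caller releases with free_extent_state().

/* Sketch: atomically set DIRTY and clear DELALLOC on an inclusive range. */
static int example_convert_delalloc_to_dirty(struct extent_io_tree *tree,
                                             u64 start, u64 end)
{
        struct extent_state *cached = NULL;
        int ret;

        ret = convert_extent_bit(tree, start, end, EXTENT_DIRTY,
                                 EXTENT_DELALLOC, &cached);
        free_extent_state(cached);      /* Drop the cached reference, if any. */
        return ret;
}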
+ * + * All allocations are done with GFP_NOFS. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; + int err = 0; + u64 last_start; + u64 last_end; + bool first_iteration = true; + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); + +again: + if (!prealloc) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ + prealloc = alloc_extent_state(GFP_NOFS); + if (!prealloc && !first_iteration) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going. + */ + if (state->start == start && state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. 
+ */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, NULL); + cache_state(prealloc, cached_state); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; +} + +/* + * Find the first range that has @bits not set. This range could start before + * @start. + * + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set it's possible that @end_ret contains -1, this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + struct extent_state *prev = NULL, *next = NULL; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + state = tree_search_prev_next(tree, start, &prev, &next); + if (!state && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!state && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. 
+ */ + *start_ret = prev->end + 1; + *end_ret = -1; + goto out; + } else if (!state) { + state = next; + } + + /* + * At this point 'state' either contains 'start' or start is + * before 'state' + */ + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as the + * beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } + } else { + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) + *start_ret = prev->end + 1; + else + *start_ret = 0; + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (state) { + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + state = next_state(state); + } +out: + spin_unlock(&tree->lock); +} + +/* + * Count the number of bytes in the tree that have a given bit(s) set for a + * given range. + * + * @tree: The io tree to search. + * @start: The start offset of the range. This value is updated to the + * offset of the first byte found with the given bit(s), so it + * can end up being bigger than the initial value. + * @search_end: The end offset (inclusive value) of the search range. + * @max_bytes: The maximum byte count we are interested. The search stops + * once it reaches this count. + * @bits: The bits the range must have in order to be accounted for. + * If multiple bits are set, then only subranges that have all + * the bits set are accounted for. + * @contig: Indicate if we should ignore holes in the range or not. If + * this is true, then stop once we find a hole. + * @cached_state: A cached state to be used across multiple calls to this + * function in order to speedup searches. Use NULL if this is + * called only once or if each call does not start where the + * previous one ended. + * + * Returns the total number of bytes found within the given range that have + * all given bits set. If the returned number of bytes is greater than zero + * then @start is updated with the offset of the first byte with the bits set. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig, + struct extent_state **cached_state) +{ + struct extent_state *state = NULL; + struct extent_state *cached; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (WARN_ON(search_end < cur_start)) + return 0; + + spin_lock(&tree->lock); + + if (!cached_state || !*cached_state) + goto search; + + cached = *cached_state; + + if (!extent_state_in_tree(cached)) + goto search; + + if (cached->start <= cur_start && cur_start <= cached->end) { + state = cached; + } else if (cached->start > cur_start) { + struct extent_state *prev; + + /* + * The cached state starts after our search range's start. Check + * if the previous state record starts at or before the range we + * are looking for, and if so, use it - this is a common case + * when there are holes between records in the tree. If there is + * no previous state record, we can start from our cached state. 
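Per the count_range_bits() kernel-doc above, the function sums the bytes of all states in the range that have every requested bit set, advancing *start to the first matching byte; a minimal sketch (helper name hypothetical, no cap on the byte count and no cached state).

/* Sketch: how many bytes in the inclusive [start, end] are marked delalloc? */
static u64 example_delalloc_bytes(struct extent_io_tree *tree, u64 start, u64 end)
{
        u64 first = start;

        /* @first is updated to the first byte found with the bits set. */
        return count_range_bits(tree, &first, end, (u64)-1,
                                EXTENT_DELALLOC, 0, NULL);
}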
+ */ + prev = prev_state(cached); + if (!prev) + state = cached; + else if (prev->start <= cur_start && cur_start <= prev->end) + state = prev; + } + + /* + * This search will find all the extents that end after our range + * starts. + */ +search: + if (!state) + state = tree_search(tree, cur_start); + + while (state) { + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + state = next_state(state); + } + + if (cached_state) { + free_extent_state(*cached_state); + *cached_state = state; + if (state) + refcount_inc(&state->refs); + } + + spin_unlock(&tree->lock); + + return total_bytes; +} + +/* + * Search a range in the state tree for a given mask. If 'filled' == 1, this + * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 + * is returned if any bit in the range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && extent_state_in_tree(cached) && cached->start <= start && + cached->end > start) + state = cached; + else + state = tree_search(tree, start); + while (state && start <= end) { + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + state = next_state(state); + } + + /* We ran out of states and were still inside of our range. */ + if (filled && !state) + bitset = 0; + spin_unlock(&tree->lock); + return bitset; +} + +/* Wrappers around set/clear extent bit */ +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); +} + +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __clear_extent_bit(tree, start, end, bits, NULL, changeset); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + NULL, cached, NULL); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached); + return 0; + } + return 1; +} + +/* + * Either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
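lock_extent() below sets EXTENT_LOCKED exclusively and sleeps until any conflicting lock is cleared, while try_lock_extent() above is the non-blocking variant; the usual pattern pairs it with unlock_extent(), the inline wrapper declared later in this patch, which clears the bit, wakes waiters and releases the cached reference. A sketch with a hypothetical function name:

/* Sketch: exclusive range locking around some work on the inclusive range. */
static void example_locked_section(struct extent_io_tree *tree, u64 start, u64 end)
{
        struct extent_state *cached = NULL;

        lock_extent(tree, start, end, &cached);

        /* ... operate on the locked byte range ... */

        unlock_extent(tree, start, end, &cached);
}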
+ */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state) +{ + struct extent_state *failed_state = NULL; + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + &failed_state, cached_state, NULL); + while (err == -EEXIST) { + if (failed_start != start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached_state); + + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, + &failed_state); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + &failed_start, &failed_state, + cached_state, NULL); + } + return err; +} + +void __cold extent_state_free_cachep(void) +{ + btrfs_extent_state_leak_debug_check(); + kmem_cache_destroy(extent_state_cache); +} + +int __init extent_state_init_cachep(void) +{ + extent_state_cache = kmem_cache_create("btrfs_extent_state", + sizeof(struct extent_state), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h new file mode 100644 index 0000000000..28c23a23d1 --- /dev/null +++ b/fs/btrfs/extent-io-tree.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_IO_TREE_H +#define BTRFS_EXTENT_IO_TREE_H + +#include "misc.h" + +struct extent_changeset; + +/* Bits for the extent state */ +enum { + ENUM_BIT(EXTENT_DIRTY), + ENUM_BIT(EXTENT_UPTODATE), + ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DELALLOC), + ENUM_BIT(EXTENT_DEFRAG), + ENUM_BIT(EXTENT_BOUNDARY), + ENUM_BIT(EXTENT_NODATASUM), + ENUM_BIT(EXTENT_CLEAR_META_RESV), + ENUM_BIT(EXTENT_NEED_WAIT), + ENUM_BIT(EXTENT_NORESERVE), + ENUM_BIT(EXTENT_QGROUP_RESERVED), + ENUM_BIT(EXTENT_CLEAR_DATA_RESV), + /* + * Must be cleared only during ordered extent completion or on error + * paths if we did not manage to submit bios and create the ordered + * extents for the range. Should not be cleared during page release + * and page invalidation (if there is an ordered extent in flight), + * that is left for the ordered extent completion. + */ + ENUM_BIT(EXTENT_DELALLOC_NEW), + /* + * When an ordered extent successfully completes for a region marked as + * a new delalloc range, use this flag when clearing a new delalloc + * range to indicate that the VFS' inode number of bytes should be + * incremented and the inode's new delalloc bytes decremented, in an + * atomic way to prevent races with stat(2). + */ + ENUM_BIT(EXTENT_ADD_INODE_BYTES), + /* + * Set during truncate when we're clearing an entire range and we just + * want the extent states to go away. + */ + ENUM_BIT(EXTENT_CLEAR_ALL_BITS), + + /* + * This must be last. + * + * Bit not representing a state but a request for NOWAIT semantics, + * e.g. when allocating memory, and must be masked out from the other + * bits. 
+ */ + ENUM_BIT(EXTENT_NOWAIT) +}; + +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ + EXTENT_ADD_INODE_BYTES | \ + EXTENT_CLEAR_ALL_BITS) + +/* + * Redefined bits above which are used only in the device allocation tree, + * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV + * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit + * manipulation functions + */ +#define CHUNK_ALLOCATED EXTENT_DIRTY +#define CHUNK_TRIMMED EXTENT_DEFRAG +#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \ + CHUNK_TRIMMED) + +enum { + IO_TREE_FS_PINNED_EXTENTS, + IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, + IO_TREE_INODE_IO, + IO_TREE_RELOC_BLOCKS, + IO_TREE_TRANS_DIRTY_PAGES, + IO_TREE_ROOT_DIRTY_LOG_PAGES, + IO_TREE_INODE_FILE_EXTENT, + IO_TREE_LOG_CSUM_RANGE, + IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, +}; + +struct extent_io_tree { + struct rb_root state; + struct btrfs_fs_info *fs_info; + /* Inode associated with this tree, or NULL. */ + struct btrfs_inode *inode; + + /* Who owns this io tree, should be one of IO_TREE_* */ + u8 owner; + + spinlock_t lock; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + wait_queue_head_t wq; + refcount_t refs; + u32 state; + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner); +void extent_io_tree_release(struct extent_io_tree *tree); + +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); + +int __init extent_state_init_cachep(void); +void __cold extent_state_free_cachep(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); + +void free_extent_state(struct extent_state *state); +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, int filled, struct extent_state *cached_state); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached, + struct extent_changeset *changeset); + +static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, bits, cached, NULL); +} + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) +{ + return clear_extent_bit(tree, start, end, bits, NULL); +} + +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state); + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state) +{ + return __clear_extent_bit(tree, start, end, 
EXTENT_UPTODATE, + cached_state, NULL); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, cached); +} + +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state); + +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state); + +#endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 0000000000..b89b558b15 --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,6177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent-tree.h" +#include "tree-log.h" +#include "disk-io.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "locking.h" +#include "free-space-cache.h" +#include "free-space-tree.h" +#include "sysfs.h" +#include "qgroup.h" +#include "ref-verify.h" +#include "space-info.h" +#include "block-rsv.h" +#include "delalloc-space.h" +#include "discard.h" +#include "rcu-string.h" +#include "zoned.h" +#include "dev-replace.h" +#include "fs.h" +#include "accessors.h" +#include "root-tree.h" +#include "file-item.h" +#include "orphan.h" +#include "tree-checker.h" + +#undef SCRAMBLE_DELAYED_REFS + + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); + +static int block_group_bits(struct btrfs_block_group *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +{ + struct btrfs_root *root = btrfs_extent_root(fs_info, start); + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = start; + key.offset = len; + key.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_search_slot(NULL, 
root, &key, path, 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to lookup reference count and flags of a tree block. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags) +{ + struct btrfs_root *extent_root; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + offset = fs_info->nodesize; + metadata = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + + extent_root = btrfs_extent_root(fs_info, bytenr); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == fs_info->nodesize) + ret = 0; + } + } + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); + + goto out_free; + } + + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and try + * again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto search_again; + } + spin_lock(&head->lock); + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->ref_mod; + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. 
Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COWed through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure for the implicit back refs has fields for: + * + * - Objectid of the subvolume root + * - objectid of the file holding the reference + * - original offset in the file + * - how many bookend extents + * + * The key offset for the implicit back refs is hash of the first + * three fields. 
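A standalone sketch of how such a key offset can be derived from the three fields follows; crc32c() here is a plain bitwise CRC-32C and data_ref_hash() only mirrors the shape of hash_extent_data_ref() defined later in this file. The kernel helper may differ in seeding and finalization, and it hashes the little-endian encoding of each field (this sketch assumes a little-endian host), so treat it as a structural illustration rather than a bit-exact reimplementation.

	/* build: cc -o backref_hash backref_hash.c */
	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	static uint32_t crc32c(uint32_t crc, const void *data, size_t len)
	{
		const uint8_t *p = data;

		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1U));
		}
		return crc;
	}

	/* Two CRCs over the three fields, folded into one 64-bit key offset. */
	static uint64_t data_ref_hash(uint64_t root, uint64_t owner, uint64_t offset)
	{
		uint32_t high = ~0U, low = ~0U;

		high = crc32c(high, &root, sizeof(root));
		low = crc32c(low, &owner, sizeof(owner));
		low = crc32c(low, &offset, sizeof(offset));
		return ((uint64_t)high << 31) ^ (uint64_t)low;
	}

	int main(void)
	{
		printf("key offset: %llu\n",
		       (unsigned long long)data_ref_hash(5, 257, 0));
		return 0;
	}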
+ * + * The extent ref structure for the full back refs has field for: + * + * - number of pointers in the tree leaf + * + * The key offset for the implicit back refs is the first byte of + * the tree leaf + * + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: + * + * (root_key.objectid, inode objectid, offset in file, 1) + * + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: + * + * (btrfs_header_owner(leaf), inode objectid, offset in file) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. + * + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. These information are stored in + * tree block info structure. + */ + +/* + * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, + * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, + * is_data == BTRFS_REF_TYPE_ANY, either type is OK. + */ +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data) +{ + int type = btrfs_extent_inline_ref_type(eb, iref); + u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY || + type == BTRFS_SHARED_DATA_REF_KEY || + type == BTRFS_EXTENT_DATA_REF_KEY) { + if (is_data == BTRFS_REF_TYPE_BLOCK) { + if (type == BTRFS_TREE_BLOCK_REF_KEY) + return type; + if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + ASSERT(eb->fs_info); + /* + * Every shared one has parent tree block, + * which must be aligned to sector size. + */ + if (offset && + IS_ALIGNED(offset, eb->fs_info->sectorsize)) + return type; + } + } else if (is_data == BTRFS_REF_TYPE_DATA) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return type; + if (type == BTRFS_SHARED_DATA_REF_KEY) { + ASSERT(eb->fs_info); + /* + * Every shared one has parent tree block, + * which must be aligned to sector size. 
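Condensed into a tiny checker, the rule applied to shared backrefs is "the offset is a parent bytenr, so it must be non-zero and sector aligned"; MY_IS_ALIGNED and the sample numbers below are illustrative only.

	/* build: cc -o ref_check ref_check.c */
	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	#define MY_IS_ALIGNED(x, a)  (((x) & ((a) - 1)) == 0)

	/* A shared backref stores the parent block's bytenr in its offset field. */
	static bool shared_ref_offset_valid(uint64_t offset, uint32_t sectorsize)
	{
		return offset != 0 && MY_IS_ALIGNED(offset, sectorsize);
	}

	int main(void)
	{
		printf("%d\n", shared_ref_offset_valid(1069547520, 4096)); /* 1 */
		printf("%d\n", shared_ref_offset_valid(12345, 4096));      /* 0 */
		return 0;
	}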
+ */ + if (offset && + IS_ALIGNED(offset, eb->fs_info->sectorsize)) + return type; + } + } else { + ASSERT(is_data == BTRFS_REF_TYPE_ANY); + return type; + } + } + + WARN_ON(1); + btrfs_print_leaf(eb); + btrfs_err(eb->fs_info, + "eb %llu iref 0x%lx invalid extent inline ref type %d", + eb->start, (unsigned long)iref, type); + + return BTRFS_REF_TYPE_INVALID; +} + +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; + struct extent_buffer *leaf; + u32 nritems; + int ret; + int recow; + int err = -ENOENT; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + + if (parent) { + if (!ret) + return 0; + goto fail; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; +} + +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) +{ + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = 
BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } + } + btrfs_mark_buffer_dirty(trans, leaf); + ret = 0; +fail: + btrfs_release_path(path); + return ret; +} + +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; + struct extent_buffer *leaf; + u32 num_refs = 0; + int ret = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } else { + btrfs_err(trans->fs_info, + "unrecognized backref key (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + return -EUCLEAN; + } + + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; + + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); + btrfs_mark_buffer_dirty(trans, leaf); + } + return ret; +} + +static noinline u32 extent_data_ref_count(struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + int type; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (iref) { + /* + * If type is invalid, we should have bailed out earlier than + * this call. 
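The point of the helper that follows is that the reference count lives in a different structure depending on the backref type. A simplified sketch (the demo_* layouts ignore on-disk packing and endianness and are not the real item formats):

	/* build: cc -o inline_ref inline_ref.c */
	#include <stdio.h>
	#include <stdint.h>

	struct demo_extent_data_ref { uint64_t root, objectid, offset; uint32_t count; };
	struct demo_shared_data_ref { uint32_t count; };

	enum demo_ref_type { DEMO_EXTENT_DATA_REF, DEMO_SHARED_DATA_REF };

	/* Same idea as extent_data_ref_count(): pick the structure by ref type. */
	static uint32_t demo_ref_count(enum demo_ref_type type, const void *ref)
	{
		if (type == DEMO_EXTENT_DATA_REF)
			return ((const struct demo_extent_data_ref *)ref)->count;
		return ((const struct demo_shared_data_ref *)ref)->count;
	}

	int main(void)
	{
		struct demo_extent_data_ref d = { .root = 5, .objectid = 257, .offset = 0, .count = 3 };
		struct demo_shared_data_ref s = { .count = 1 };

		printf("%u %u\n", demo_ref_count(DEMO_EXTENT_DATA_REF, &d),
		       demo_ref_count(DEMO_SHARED_DATA_REF, &s));
		return 0;
	}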
+ */ + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + return ret; +} + +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(path); + return ret; +} + +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. 
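That three-way return contract (found, not found but insertable, or too big to stay inline) drives the callers further down. A caller-side sketch, with demo_lookup_inline_backref() as a hypothetical stand-in whose scenario argument fakes the result:

	/* build: cc -o backref_lookup backref_lookup.c */
	#include <stdio.h>
	#include <errno.h>

	static int demo_lookup_inline_backref(int scenario)
	{
		return scenario;
	}

	/* Mirrors how insert_inline_extent_backref()/__btrfs_inc_extent_ref() react:
	 *   0       -> found, bump the existing inline ref
	 *   -ENOENT -> not found, a new inline ref fits at the returned position
	 *   -EAGAIN -> no room inline, fall back to a separate keyed backref item */
	static int demo_add_backref(int scenario)
	{
		int ret = demo_lookup_inline_backref(scenario);

		switch (ret) {
		case 0:
			printf("update existing inline backref\n");
			return 0;
		case -ENOENT:
			printf("insert new inline backref at returned position\n");
			return 0;
		case -EAGAIN:
			printf("extent item full: add a keyed backref item instead\n");
			return 0;
		default:
			return ret;
		}
	}

	int main(void)
	{
		demo_add_backref(0);
		demo_add_backref(-ENOENT);
		demo_add_backref(-EAGAIN);
		return 0;
	}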
+ */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + int needed; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->search_for_extension = 1; + path->keep_locks = 1; + } else + extra_size = -1; + + /* + * Owner is our level, so we can just add one to get the level for the + * block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner; + } + +again: + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } + + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. + */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; + } + } + + if (ret && !insert) { + err = -ENOENT; + goto out; + } else if (WARN_ON(ret)) { + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + if (unlikely(item_size < sizeof(*ei))) { + err = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %llu expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, err); + goto out; + } + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } + + if (owner >= BTRFS_FIRST_FREE_OBJECTID) + needed = BTRFS_REF_TYPE_DATA; + else + needed = BTRFS_REF_TYPE_BLOCK; + + err = -ENOENT; + while (1) { + if (ptr >= end) { + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == 
BTRFS_REF_TYPE_INVALID) { + err = -EUCLEAN; + goto out; + } + + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. + * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + path->search_for_extension = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + btrfs_extend_item(trans, path, size); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else 
if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(trans, leaf); +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, + num_bytes, parent, root_objectid, + owner, offset, 0); + if (ret != -ENOENT) + return ret; + + btrfs_release_path(path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset); + } + return ret; +} + +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack int update_inline_extent_backref( + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + u64 refs; + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", + key.objectid, extent_size, refs_to_mod, refs); + return -EUCLEAN; + } + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + /* + * Function btrfs_get_extent_inline_ref_type() has already printed + * error messages. + */ + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) + return -EUCLEAN; + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + /* + * For tree blocks we can only drop one ref for it, and tree + * blocks should not have refs > 1. + * + * Furthermore if we're inserting a new inline backref, we + * won't reach this path either. That would be + * setup_inline_extent_backref(). 
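The two sanity rules the update path enforces can be condensed into a small checker (a hypothetical helper, not the kernel function): an inline tree block ref has no count field, so the only legal change is a single drop, and a data ref count must never underflow.

	/* build: cc -o refmod_check refmod_check.c */
	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	static bool refs_to_mod_valid(bool is_tree_block, uint32_t cur_refs, int refs_to_mod)
	{
		/* Tree block inline refs carry no count: only a single drop is legal. */
		if (is_tree_block && refs_to_mod != -1)
			return false;
		/* Never allow the stored count to underflow. */
		if (refs_to_mod < 0 && cur_refs < (uint32_t)(-refs_to_mod))
			return false;
		return true;
	}

	int main(void)
	{
		printf("%d\n", refs_to_mod_valid(true, 1, -1));  /* 1: drop a tree ref  */
		printf("%d\n", refs_to_mod_valid(true, 1, -2));  /* 0: would be -EUCLEAN */
		printf("%d\n", refs_to_mod_valid(false, 2, -3)); /* 0: count underflow  */
		return 0;
	}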
+ */ + if (unlikely(refs_to_mod != -1)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for tree block %llu, has %d expect -1", + key.objectid, refs_to_mod); + return -EUCLEAN; + } + } + + if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, +"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", + (unsigned long)iref, key.objectid, extent_size, + refs_to_mod, refs); + return -EUCLEAN; + } + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + btrfs_truncate_item(trans, path, item_size, 1); + } + btrfs_mark_buffer_dirty(trans, leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, + num_bytes, parent, root_objectid, + owner, offset, 1); + if (ret == 0) { + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. + */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", + bytenr, num_bytes, root_objectid, path->slots[0]); + return -EUCLEAN; + } + ret = update_inline_extent_backref(trans, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + setup_inline_extent_backref(trans, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret = 0; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) + ret = update_inline_extent_backref(trans, path, iref, + -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else + ret = btrfs_del_item(trans, root, path); + return ret; +} + +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) +{ + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + + /* Adjust the range to be aligned to 512B sectors if necessary. 
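The alignment adjustment that opens btrfs_issue_discard() is easy to see in isolation: round the start up to a 512-byte sector, shrink the length by the amount skipped, then round the length down to whole sectors. A standalone sketch, where the MY_* macros approximate the kernel's ALIGN()/round_down() and the caller is assumed to pass a range that still covers at least one sector after trimming:

	/* build: cc -o discard_align discard_align.c */
	#include <stdio.h>
	#include <stdint.h>

	#define SECTOR_SHIFT 9
	#define SECTOR_SIZE  (1U << SECTOR_SHIFT)
	#define MY_ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
	#define MY_ROUND_DOWN(x, a)  ((x) & ~((uint64_t)(a) - 1))

	static void align_discard_range(uint64_t *start, uint64_t *len)
	{
		uint64_t aligned_start = MY_ALIGN_UP(*start, SECTOR_SIZE);

		if (*start != aligned_start) {
			*len -= aligned_start - *start;
			*len = MY_ROUND_DOWN(*len, SECTOR_SIZE);
			*start = aligned_start;
		}
	}

	int main(void)
	{
		uint64_t start = 1000, len = 5000;

		align_discard_range(&start, &len);
		printf("start=%llu len=%llu\n",
		       (unsigned long long)start, (unsigned long long)len);
		/* prints start=1024 len=4608 */
		return 0;
	}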
*/ + if (start != aligned_start) { + len -= aligned_start - start; + len = round_down(len, 1 << SECTOR_SHIFT); + start = aligned_start; + } + + *discarded_bytes = 0; + + if (!len) + return 0; + + end = start + len; + bytes_left = len; + + /* Skip any superblocks on this device. */ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; + + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; + + /* + * Superblock spans beginning of range. Adjust start and + * try again. + */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; + } + + if (size) { + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + size >> SECTOR_SHIFT, + GFP_NOFS); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_end; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + bytes_left >> SECTOR_SHIFT, + GFP_NOFS); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; +} + +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) +{ + struct btrfs_device *dev = stripe->dev; + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 phys = stripe->physical; + u64 len = stripe->length; + u64 discarded = 0; + int ret = 0; + + /* Zone reset on a zoned filesystem */ + if (btrfs_can_zone_reset(dev, phys, len)) { + u64 src_disc; + + ret = btrfs_reset_device_zone(dev, phys, len, &discarded); + if (ret) + goto out; + + if (!btrfs_dev_replace_is_ongoing(dev_replace) || + dev != dev_replace->srcdev) + goto out; + + src_disc = discarded; + + /* Send to replace target as well */ + ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, + &discarded); + discarded += src_disc; + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { + ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); + } else { + ret = 0; + *bytes = 0; + } + +out: + *bytes = discarded; + return ret; +} + +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + int ret = 0; + u64 discarded_bytes = 0; + u64 end = bytenr + num_bytes; + u64 cur = bytenr; + + /* + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(fs_info); + while (cur < end) { + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; + int i; + + num_bytes = end - cur; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } + + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; + u64 bytes; + + if (!stripe->dev->bdev) { + ASSERT(btrfs_test_opt(fs_info, DEGRADED)); + continue; + } + + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) + continue; + + ret = do_discard_extent(stripe, &bytes); + if (ret) { + /* + * Keep going if discard is not supported by the + * device. 
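The error policy of the discard loop below boils down to "not supported is not an error, anything else aborts the extent". A condensed sketch, with demo_discard_stripe() as a hypothetical per-stripe call whose scenario argument fakes the result:

	/* build: cc -o discard_errs discard_errs.c */
	#include <stdio.h>
	#include <errno.h>

	static int demo_discard_stripe(int scenario)
	{
		return scenario;
	}

	static int demo_discard_extent(const int *scenarios, int nr)
	{
		int ret = 0;

		for (int i = 0; i < nr; i++) {
			ret = demo_discard_stripe(scenarios[i]);
			if (ret == -EOPNOTSUPP) {
				ret = 0;	/* device can't discard: keep going */
				continue;
			}
			if (ret)
				break;		/* real I/O error: stop the extent */
		}
		return ret;
	}

	int main(void)
	{
		int ok[] = { 0, -EOPNOTSUPP, 0 };
		int bad[] = { 0, -EIO };

		printf("%d %d\n", demo_discard_extent(ok, 3), demo_discard_extent(bad, 2));
		return 0;
	}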
+ */ + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; + } + } + kfree(stripes); + if (ret) + break; + cur += num_bytes; + } + btrfs_bio_counter_dec(fs_info); + if (actual_bytes) + *actual_bytes = discarded_bytes; + return ret; +} + +/* Can return -ENOMEM */ +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && + generic_ref->action); + BUG_ON(generic_ref->type == BTRFS_REF_METADATA && + generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); + + if (generic_ref->type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); + + btrfs_ref_tree_mod(fs_info, generic_ref); + + return ret; +} + +/* + * __btrfs_inc_extent_ref - insert backreference for a given extent + * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. + * + * @trans: Handle of transaction + * + * @node: The delayed ref node used to get the bytenr/length for + * extent whose references are incremented. + * + * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ + * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical + * bytenr of the parent block. Since new extents are always + * created with indirect references, this will only be the case + * when relocating a shared extent. In that case, root_objectid + * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must + * be 0 + * + * @root_objectid: The id of the root where this modification has originated, + * this can be either one of the well-known metadata trees or + * the subvolume id which references this extent. + * + * @owner: For data extents it is the inode number of the owning file. + * For metadata extents this parameter holds the level in the + * tree of the extent. + * + * @offset: For metadata extents the offset is ignored and is currently + * always passed as 0. For data extents it is the fileoffset + * this extent belongs to. + * + * @refs_to_add Number of references to add + * + * @extent_op Pointer to a structure, holding information necessary when + * updating a tree block's flags + * + */ +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *item; + struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; + u64 refs; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, + parent, root_objectid, owner, + offset, refs_to_add, extent_op); + if ((ret < 0 && ret != -EAGAIN) || !ret) + goto out; + + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. 
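When the fallback path inserts that "normal" keyed backref, the key type depends on whether the owner is a tree level or an inode number and whether a parent bytenr was supplied, mirroring extent_ref_type() earlier in this file. A sketch of that decision, where 256 is the assumed BTRFS_FIRST_FREE_OBJECTID threshold and the demo_* names are local:

	/* build: cc -o ref_type ref_type.c */
	#include <stdio.h>
	#include <stdint.h>

	#define DEMO_FIRST_FREE_OBJECTID 256ULL

	enum demo_key_type {
		DEMO_TREE_BLOCK_REF,	/* keyed on owning root                */
		DEMO_SHARED_BLOCK_REF,	/* keyed on parent bytenr              */
		DEMO_EXTENT_DATA_REF,	/* keyed on (root, inode, offset) hash */
		DEMO_SHARED_DATA_REF,	/* keyed on parent bytenr              */
	};

	/* Metadata vs data comes from the owner (tree levels are small, inode
	 * numbers start at 256), shared vs indirect from having a parent. */
	static enum demo_key_type demo_extent_ref_type(uint64_t parent, uint64_t owner)
	{
		if (owner < DEMO_FIRST_FREE_OBJECTID)
			return parent ? DEMO_SHARED_BLOCK_REF : DEMO_TREE_BLOCK_REF;
		return parent ? DEMO_SHARED_DATA_REF : DEMO_EXTENT_DATA_REF;
	}

	int main(void)
	{
		printf("%d %d %d %d\n",
		       demo_extent_ref_type(0, 1),	/* tree block, indirect */
		       demo_extent_ref_type(4096, 1),	/* tree block, shared   */
		       demo_extent_ref_type(0, 257),	/* data, indirect       */
		       demo_extent_ref_type(4096, 257));/* data, shared         */
		return 0;
	}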
+ */ + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + /* now insert the actual backref */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + else + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); + + if (ret) + btrfs_abort_transaction(trans, ret); +out: + btrfs_free_path(path); + return ret; +} + +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + bool insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); + + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) + flags |= extent_op->flags_to_set; + ret = alloc_reserved_file_extent(trans, parent, ref_root, + flags, ref->objectid, + ref->offset, &ins, + node->ref_mod); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ref->objectid, ref->offset, + node->ref_mod, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, node, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); + } + return ret; +} + +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + int metadata = 1; + + if (TRANS_ABORTED(trans)) + return 0; + + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + metadata = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = head->bytenr; + + if (metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = extent_op->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; + } + + root = btrfs_extent_root(fs_info, key.objectid); +again: + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (metadata) 
{ + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == head->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == head->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; + + key.objectid = head->bytenr; + key.offset = head->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, extent_op->level); + goto out; + } + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { + err = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, err); + goto out; + } + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); + + btrfs_mark_buffer_dirty(trans, leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + bool insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_tree_ref *ref; + u64 parent = 0; + u64 ref_root = 0; + + ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + ref_root = ref->root; + + if (unlikely(node->ref_mod != 1)) { + btrfs_err(trans->fs_info, + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", + node->bytenr, node->ref_mod, node->action, ref_root, + parent); + return -EUCLEAN; + } + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags); + ret = alloc_reserved_tree_block(trans, node, extent_op); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, node, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } + return ret; +} + +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + bool insert_reserved) +{ + int ret = 0; + + if (TRANS_ABORTED(trans)) { + if (insert_reserved) + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + return 0; + } + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, node, extent_op, + insert_reserved); + else + BUG(); + if (ret && insert_reserved) + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + if (ret < 0) + btrfs_err(trans->fs_info, +"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", + node->bytenr, node->num_bytes, node->type, + node->action, node->ref_mod, ret); + return ret; +} + +static inline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + 
struct btrfs_delayed_ref_node *ref; + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = false; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +static struct btrfs_delayed_extent_op *cleanup_extent_op( + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + + if (!extent_op) + return NULL; + + if (head->must_insert_reserved) { + head->extent_op = NULL; + btrfs_free_delayed_extent_op(extent_op); + return NULL; + } + return extent_op; +} + +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = cleanup_extent_op(head); + if (!extent_op) + return 0; + head->extent_op = NULL; + spin_unlock(&head->lock); + ret = run_delayed_extent_op(trans, head, extent_op); + btrfs_free_delayed_extent_op(extent_op); + return ret ? ret : 1; +} + +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + int nr_items = 1; /* Dropping this ref head update. */ + + /* + * We had csum deletions accounted for in our delayed refs rsv, we need + * to drop the csum leaves for this update from our delayed_refs_rsv. + */ + if (head->total_ref_mod < 0 && head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + } + + btrfs_delayed_refs_rsv_release(fs_info, nr_items); +} + +static int cleanup_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + ret = run_and_cleanup_extent_op(trans, head); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, head); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + return ret; + } else if (ret) { + return ret; + } + + /* + * Need to drop our head ref lock and re-acquire the delayed ref lock + * and then re-check to make sure nobody got added. 
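The lock dance that follows (drop the inner lock, take the outer lock, retake the inner lock, then repeat the emptiness check) is a classic re-check pattern. A userspace sketch with pthread mutexes standing in for the kernel spinlocks and a boolean standing in for the rbtree/extent_op check:

	/* build: cc -pthread -o recheck recheck.c */
	#include <stdio.h>
	#include <stdbool.h>
	#include <pthread.h>

	static pthread_mutex_t delayed_refs_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t head_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool head_has_new_refs;	/* true if someone requeued work meanwhile */

	/* Caller holds head_lock; both locks are needed to delete the head, and
	 * the outer lock must be taken first, so drop, reacquire, re-check. */
	static int demo_cleanup_head(void)
	{
		pthread_mutex_unlock(&head_lock);
		pthread_mutex_lock(&delayed_refs_lock);
		pthread_mutex_lock(&head_lock);

		if (head_has_new_refs) {
			/* Refs were added while the lock was dropped: retry later. */
			pthread_mutex_unlock(&head_lock);
			pthread_mutex_unlock(&delayed_refs_lock);
			return 1;
		}

		printf("head is still empty, safe to delete it\n");
		pthread_mutex_unlock(&head_lock);
		pthread_mutex_unlock(&delayed_refs_lock);
		return 0;
	}

	int main(void)
	{
		pthread_mutex_lock(&head_lock);
		return demo_cleanup_head();
	}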
+ */ + spin_unlock(&head->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + return 1; + } + btrfs_delete_ref_head(delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + if (head->must_insert_reserved) { + btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); + if (head->is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(fs_info, head->bytenr); + ret = btrfs_del_csums(trans, csum_root, head->bytenr, + head->num_bytes); + } + } + + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + + trace_run_delayed_ref_head(fs_info, head, 0); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); + return ret; +} + +static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_delayed_ref_head *head = NULL; + int ret; + + spin_lock(&delayed_refs->lock); + head = btrfs_select_ref_head(delayed_refs); + if (!head) { + spin_unlock(&delayed_refs->lock); + return head; + } + + /* + * Grab the lock that says we are going to process all the refs for + * this head + */ + ret = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (ret == -EAGAIN) + head = ERR_PTR(-EAGAIN); + + return head; +} + +static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *locked_ref) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_delayed_ref_node *ref; + bool must_insert_reserved; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + lockdep_assert_held(&locked_ref->mutex); + lockdep_assert_held(&locked_ref->lock); + + while ((ref = select_delayed_ref(locked_ref))) { + if (ref->seq && + btrfs_check_delayed_seq(fs_info, ref->seq)) { + spin_unlock(&locked_ref->lock); + unselect_delayed_ref_head(delayed_refs, locked_ref); + return -EAGAIN; + } + + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + atomic_dec(&delayed_refs->num_entries); + + /* + * Record the must_insert_reserved flag before we drop the + * spin lock. 
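The "record before dropping the lock" step is a capture-and-clear: the flag is copied to a local and reset while the lock is still held, so the value used afterwards cannot change underneath us or be consumed twice. A pthread-based sketch (a mutex standing in for the spinlock, demo_* names local):

	/* build: cc -pthread -o snapshot_flag snapshot_flag.c */
	#include <stdio.h>
	#include <stdbool.h>
	#include <pthread.h>

	struct demo_head {
		pthread_mutex_t lock;
		bool must_insert_reserved;
	};

	static void demo_run_one_ref(struct demo_head *head)
	{
		bool must_insert;

		pthread_mutex_lock(&head->lock);
		must_insert = head->must_insert_reserved;	/* snapshot ...   */
		head->must_insert_reserved = false;		/* ... and consume */
		pthread_mutex_unlock(&head->lock);

		printf("run ref, insert reserved extent: %s\n", must_insert ? "yes" : "no");
	}

	int main(void)
	{
		struct demo_head head = { PTHREAD_MUTEX_INITIALIZER, true };

		demo_run_one_ref(&head);
		demo_run_one_ref(&head);	/* second run sees the flag consumed */
		return 0;
	}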
+ */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = false; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + spin_unlock(&locked_ref->lock); + + ret = run_one_delayed_ref(trans, ref, extent_op, + must_insert_reserved); + + btrfs_free_delayed_extent_op(extent_op); + if (ret) { + unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_put_delayed_ref(ref); + return ret; + } + + btrfs_put_delayed_ref(ref); + cond_resched(); + + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + } + + return 0; +} + +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + unsigned long nr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *locked_ref = NULL; + int ret; + unsigned long count = 0; + + delayed_refs = &trans->transaction->delayed_refs; + do { + if (!locked_ref) { + locked_ref = btrfs_obtain_ref_head(trans); + if (IS_ERR_OR_NULL(locked_ref)) { + if (PTR_ERR(locked_ref) == -EAGAIN) { + continue; + } else { + break; + } + } + count++; + } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already + * unlocked everything so just bail out + */ + return ret; + } else if (!ret) { + /* + * Success, perform the usual cleanup of a processed + * head + */ + ret = cleanup_ref_head(trans, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; + continue; + } else if (ret) { + return ret; + } + } + + /* + * Either success case or btrfs_run_delayed_refs_for_head + * returned -EAGAIN, meaning we need to select another head + */ + + locked_ref = NULL; + cond_resched(); + } while ((nr != -1 && count < nr) || locked_ref); + + return 0; +} + +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. 
To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + unsigned long count) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + int ret; + int run_all = count == (unsigned long)-1; + + /* We'll clean this up in btrfs_cleanup_transaction */ + if (TRANS_ABORTED(trans)) + return 0; + + if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) + return 0; + + delayed_refs = &trans->transaction->delayed_refs; + if (count == 0) + count = delayed_refs->num_heads_ready; + +again: +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + ret = __btrfs_run_delayed_refs(trans, count); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + if (run_all) { + btrfs_create_pending_block_groups(trans); + + spin_lock(&delayed_refs->lock); + node = rb_first_cached(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); + goto out; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + /* Mutex was contended, block until it's released and retry. 
*/ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref_head(head); + cond_resched(); + goto again; + } +out: + return 0; +} + +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags) +{ + struct btrfs_delayed_extent_op *extent_op; + int level = btrfs_header_level(eb); + int ret; + + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->level = level; + + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); + if (ret) + btrfs_free_delayed_extent_op(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_transaction *cur_trans; + struct rb_node *node; + int ret = 0; + + spin_lock(&root->fs_info->trans_lock); + cur_trans = root->fs_info->running_transaction; + if (cur_trans) + refcount_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + if (!cur_trans) + return 0; + + delayed_refs = &cur_trans->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return 0; + } + + if (!mutex_trylock(&head->mutex)) { + if (path->nowait) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and let + * caller try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + spin_unlock(&delayed_refs->lock); + + spin_lock(&head->lock); + /* + * XXX: We should replace this with a proper search function in the + * future. + */ + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. 
+ */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } + } + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + btrfs_put_transaction(cur_trans); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr, + bool strict) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 item_size; + int type; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); /* Corruption */ + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + ret = 1; + item_size = btrfs_item_size(leaf, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + /* If extent item has more than 1 inline ref then it's shared */ + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; + + /* + * If extent created before last snapshot => it's shared unless the + * snapshot has been deleted. Use the heuristic if strict is false. + */ + if (!strict && + (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item))) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + + /* If this extent has SHARED_DATA_REF then it's shared */ + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (type != BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, + u64 bytenr, bool strict, struct btrfs_path *path) +{ + int ret; + + do { + ret = check_committed_ref(root, path, objectid, + offset, bytenr, strict); + if (ret && ret != -ENOENT) + goto out; + + ret = check_delayed_ref(root, path, objectid, offset, bytenr); + } while (ret == -EAGAIN); + +out: + btrfs_release_path(path); + if (btrfs_is_data_reloc_root(root)) + WARN_ON(ret > 0); + return ret; +} + +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + int full_backref, int inc) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 ref_root; + u32 nritems; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct btrfs_ref generic_ref = { 0 }; + bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); + int i; + int action; + int level; + int ret = 0; + + if (btrfs_is_testing(fs_info)) + return 0; + + ref_root = btrfs_header_owner(buf); + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if 
(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) + return 0; + + if (full_backref) + parent = buf->start; + else + parent = 0; + if (inc) + action = BTRFS_ADD_DELAYED_REF; + else + action = BTRFS_DROP_DELAYED_REF; + + for (i = 0; i < nritems; i++) { + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, + key.offset, root->root_key.objectid, + for_reloc); + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = fs_info->nodesize; + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, + root->root_key.objectid, for_reloc); + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); + if (ret) + goto fail; + } + } + return 0; +fail: + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} + +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); +} + +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 flags; + u64 ret; + + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + + ret = btrfs_get_alloc_profile(fs_info, flags); + return ret; +} + +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) +{ + struct rb_node *leftmost; + u64 bytenr = 0; + + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. 
*/ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; + + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); + + return bytenr; +} + +static int pin_down_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *cache, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, + num_bytes); + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + return 0; +} + +int btrfs_pin_extent(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + BUG_ON(!cache); /* Logic error */ + + pin_down_extent(trans, cache, bytenr, num_bytes, reserved); + + btrfs_put_block_group(cache); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group *cache; + int ret; + + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + if (!cache) + return -EINVAL; + + /* + * Fully cache the free space first so that our pin removes the free space + * from the cache. + */ + ret = btrfs_cache_block_group(cache, true); + if (ret) + goto out; + + pin_down_extent(trans, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); +out: + btrfs_put_block_group(cache); + return ret; +} + +static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group *block_group; + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) + return -EINVAL; + + ret = btrfs_cache_block_group(block_group, true); + if (ret) + goto out; + + ret = btrfs_remove_free_space(block_group, start, num_bytes); +out: + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + int ret = 0; + + if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); + if (ret) + break; + } + + return ret; +} + +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) +{ + atomic_inc(&bg->reservations); +} + +/* + * Returns the free cluster for the given space info and 
sets empty_cluster to + * what it should be based on the mount options. + */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &fs_info->meta_alloc_cluster; + if (btrfs_test_opt(fs_info, SSD)) + *empty_cluster = SZ_2M; + else + *empty_cluster = SZ_64K; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && + btrfs_test_opt(fs_info, SSD_SPREAD)) { + *empty_cluster = SZ_2M; + ret = &fs_info->data_alloc_cluster; + } + + return ret; +} + +static int unpin_extent_range(struct btrfs_fs_info *fs_info, + u64 start, u64 end, + const bool return_free_space) +{ + struct btrfs_block_group *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; + u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; + bool readonly; + + while (start <= end) { + readonly = false; + if (!cache || + start >= cache->start + cache->length) { + if (cache) + btrfs_put_block_group(cache); + total_unpinned = 0; + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(fs_info, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; + } + + len = cache->start + cache->length - start; + len = min(len, end + 1 - start); + + if (return_free_space) + btrfs_add_free_space(cache, start, len); + + start += len; + total_unpinned += len; + space_info = cache->space_info; + + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. 
+ */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + space_info->max_extent_size = 0; + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } else if (btrfs_is_zoned(fs_info)) { + /* Need reset before reusing in a zoned block group */ + space_info->bytes_zone_unusable += len; + readonly = true; + } + spin_unlock(&cache->lock); + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); + } + + if (cache) + btrfs_put_block_group(cache); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *block_group, *tmp; + struct list_head *deleted_bgs; + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = &trans->transaction->pinned_extents; + + while (!TRANS_ABORTED(trans)) { + struct extent_state *cached_state = NULL; + + mutex_lock(&fs_info->unused_bg_unpin_mutex); + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + break; + } + + if (btrfs_test_opt(fs_info, DISCARD_SYNC)) + ret = btrfs_discard_extent(fs_info, start, + end + 1 - start, NULL); + + clear_extent_dirty(unpin, start, end, &cached_state); + unpin_extent_range(fs_info, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + free_extent_state(cached_state); + cond_resched(); + } + + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_calc_delay(&fs_info->discard_ctl); + btrfs_discard_schedule_work(&fs_info->discard_ctl, true); + } + + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. 
+ */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; + + ret = -EROFS; + if (!TRANS_ABORTED(trans)) + ret = btrfs_discard_extent(fs_info, + block_group->start, + block_group->length, + &trimmed); + + list_del_init(&block_group->bg_list); + btrfs_unfreeze_block_group(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "discard failed while removing blockgroup: errno=%d %s", + ret, errstr); + } + } + + return 0; +} + +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + +#define abort_and_dump(trans, path, fmt, args...) \ +({ \ + btrfs_abort_transaction(trans, -EUCLEAN); \ + btrfs_print_leaf(path->nodes[0]); \ + btrfs_crit(trans->fs_info, fmt, ##args); \ +}) + +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in a keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get something like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF ) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get something like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF ) gets removed. 
+ */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_root *extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + int ret; + int is_data; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + u32 item_size; + u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; + bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + + extent_root = btrfs_extent_root(info, bytenr); + ASSERT(extent_root); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + if (is_data) + skinny_metadata = false; + + ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, + parent, root_objectid, owner_objectid, + owner_offset); + if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near the current slot. + */ + extent_slot = path->slots[0]; + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + extent_slot); + if (key.objectid != bytenr) + break; + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { + found_extent = 1; + break; + } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } + + /* Quick path didn't find the EXTENT/METADATA_ITEM */ + if (path->slots[0] - extent_slot > 5) + break; + extent_slot--; + } + + if (!found_extent) { + if (iref) { + abort_and_dump(trans, path, +"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", + path->slots[0]); + ret = -EUCLEAN; + goto out; + } + /* Must be SHARED_* item, remove the backref first */ + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, is_data); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_release_path(path); + + /* Slow path to locate EXTENT/METADATA_ITEM */ + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } + + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. 
+ */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } + + if (ret) { + if (ret > 0) + btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "umm, got %d back from search, was looking for %llu, slot %d", + ret, bytenr, path->slots[0]); + } + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + extent_slot = path->slots[0]; + } + } else if (WARN_ON(ret == -ENOENT)) { + abort_and_dump(trans, path, +"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", + bytenr, parent, root_objectid, owner_objectid, + owner_offset, path->slots[0]); + goto out; + } else { + btrfs_abort_transaction(trans, ret); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, extent_slot); + if (unlikely(item_size < sizeof(*ei))) { + ret = -EUCLEAN; + btrfs_err(trans->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); + goto out; + } + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { + struct btrfs_tree_block_info *bi; + + if (item_size < sizeof(*ei) + sizeof(*bi)) { + abort_and_dump(trans, path, +"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + path->slots[0], owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + ret = -EUCLEAN; + goto out; + } + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + + refs = btrfs_extent_refs(leaf, ei); + if (refs < refs_to_drop) { + abort_and_dump(trans, path, + "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", + refs_to_drop, refs, bytenr, path->slots[0]); + ret = -EUCLEAN; + goto out; + } + refs -= refs_to_drop; + + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref + */ + if (iref) { + if (!found_extent) { + abort_and_dump(trans, path, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", + path->slots[0]); + ret = -EUCLEAN; + goto out; + } + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(trans, leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, is_data); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + } else { + /* In this branch refs == 1 */ + if (found_extent) { + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + abort_and_dump(trans, path, + "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", + extent_data_ref_count(path, iref), + refs_to_drop, path->slots[0]); + ret = -EUCLEAN; + goto out; + } + if (iref) { + if (path->slots[0] != extent_slot) { + abort_and_dump(trans, path, +"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", + key.objectid, key.type, + key.offset, path->slots[0]); + ret = -EUCLEAN; + goto out; + } + } 
else { + /* + * No inline ref, we must be at SHARED_* item, + * and it's a single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + abort_and_dump(trans, path, + "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", + path->slots[0]); + ret = -EUCLEAN; + goto out; + } + path->slots[0] = extent_slot; + num_to_del = 2; + } + } + + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_release_path(path); + + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); + } + btrfs_release_path(path); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * when we free a block, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. + */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) + goto out_delayed_unlock; + + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + goto out; + + if (cleanup_extent_op(head) != NULL) + goto out; + + /* + * waiting for the lock here would deadlock. If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; + + btrfs_delete_ref_head(delayed_refs, head); + head->processing = false; + + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + return ret; +out: + spin_unlock(&head->lock); + +out_delayed_unlock: + spin_unlock(&delayed_refs->lock); + return 0; +} + +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_ref generic_ref = { 0 }; + int ret; + + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root_id, 0, false); + + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); + BUG_ON(ret); /* -ENOMEM */ + } + + if (last_ref && btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group *cache; + bool must_pin = false; + + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) { + btrfs_redirty_list_add(trans->transaction, buf); + goto out; + } + } + + cache = btrfs_lookup_block_group(fs_info, buf->start); + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. 
If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. + */ + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + must_pin = true; + + if (must_pin || btrfs_is_zoned(fs_info)) { + btrfs_redirty_list_add(trans->transaction, buf); + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + btrfs_free_reserved_bytes(cache, buf->len, 0); + btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + } +out: + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } +} + +/* Can return -ENOMEM */ +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + if (btrfs_is_testing(fs_info)) + return 0; + + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. + */ + if ((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { + /* unlocks the pinned mutex */ + btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); + ret = 0; + } else if (ref->type == BTRFS_REF_METADATA) { + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, ref, 0); + } + + if (!((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) + btrfs_ref_tree_mod(fs_info, ref); + + return ret; +} + +enum btrfs_loop_type { + /* + * Start caching block groups but do not wait for progress or for them + * to be done. + */ + LOOP_CACHING_NOWAIT, + + /* + * Wait for the block group free_space >= the space we're waiting for if + * the block group isn't cached. + */ + LOOP_CACHING_WAIT, + + /* + * Allow allocations to happen from block groups that do not yet have a + * size classification. + */ + LOOP_UNSET_SIZE_CLASS, + + /* + * Allocate a chunk and then retry the allocation. + */ + LOOP_ALLOC_CHUNK, + + /* + * Ignore the size class restrictions for this allocation. + */ + LOOP_WRONG_SIZE_CLASS, + + /* + * Ignore the empty size, only try to allocate the number of bytes + * needed for this allocation. 
+ */ + LOOP_NO_EMPTY_SIZE, +}; + +static inline void +btrfs_lock_block_group(struct btrfs_block_group *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group *btrfs_lock_cluster( + struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) + __acquires(&cluster->refill_lock) +{ + struct btrfs_block_group *used_bg = NULL; + + spin_lock(&cluster->refill_lock); + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + + /* We should only have one-level nested. */ + down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); + + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need to fall back to unclustered mode. + * Return >0 to inform caller that we found nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group *bg, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **cluster_bg_ret) +{ + struct btrfs_block_group *cluster_bg; + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + u64 aligned_cluster; + u64 offset; + int ret; + + cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || + !block_group_bits(cluster_bg, ffe_ctl->flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, + ffe_ctl->num_bytes, cluster_bg->start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; + } + WARN_ON(last_ptr->block_group != cluster_bg); + +release_cluster: + /* + * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so + * let's just skip it and let the allocator find whatever block it can + * find. If we reach this point, we will have tried the cluster + * allocator plenty of times and not have found anything, so we are + * likely way too fragmented for the clustering stuff to find anything. + * + * However, if the cluster is taken from the current block group, + * release the cluster first, so that we stand a better chance of + * succeeding in the unclustered allocation. 
+ */ + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + return -ENOENT; + } + + /* This cluster didn't work out, free it and start over */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + if (cluster_bg != bg) + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + +refill_cluster: + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + return -ENOENT; + } + + aligned_cluster = max_t(u64, + ffe_ctl->empty_cluster + ffe_ctl->empty_size, + bg->full_stripe_len); + ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, + ffe_ctl->num_bytes, aligned_cluster); + if (ret == 0) { + /* Now pull our allocation out of this cluster */ + offset = btrfs_alloc_from_cluster(bg, last_ptr, + ffe_ctl->num_bytes, ffe_ctl->search_start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); + return 0; + } + } + /* + * At this point we either didn't find a cluster or we weren't able to + * allocate a block from our cluster. Free the cluster we've been + * trying to use, and go to the next block group. + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + return 1; +} + +/* + * Return >0 to inform caller that we found nothing + * Return 0 when we found a free extent and set ffe_ctl->found_offset + */ +static int find_free_extent_unclustered(struct btrfs_block_group *bg, + struct find_free_extent_ctl *ffe_ctl) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + u64 offset; + + /* + * We are doing an unclustered allocation, set the fragmented flag so + * we don't bother trying to set up a cluster again until we get more + * space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } + if (ffe_ctl->cached) { + struct btrfs_free_space_ctl *free_space_ctl; + + free_space_ctl = bg->free_space_ctl; + spin_lock(&free_space_ctl->tree_lock); + if (free_space_ctl->free_space < + ffe_ctl->num_bytes + ffe_ctl->empty_cluster + + ffe_ctl->empty_size) { + ffe_ctl->total_free_space = max_t(u64, + ffe_ctl->total_free_space, + free_space_ctl->free_space); + spin_unlock(&free_space_ctl->tree_lock); + return 1; + } + spin_unlock(&free_space_ctl->tree_lock); + } + + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); + if (!offset) + return 1; + ffe_ctl->found_offset = offset; + return 0; +} + +static int do_allocation_clustered(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + int ret; + + /* We want to try and use the cluster allocator, so let's look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); + if (ret >= 0) + return ret; + /* ret == -ENOENT case falls through */ + } + + return find_free_extent_unclustered(block_group, ffe_ctl); +} + +/* + * Tree-log block group locking + * ============================ + * + * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which + * indicates the starting address of a block group, which is reserved only + * for tree-log metadata. 
+ * + * Lock nesting + * ============ + * + * space_info::lock + * block_group::lock + * fs_info::treelog_bg_lock + */ + +/* + * Simple allocator for sequential-only block group. It only allows sequential + * allocation. No need to play with trees. This function also reserves the + * bytes as in btrfs_add_reserved_bytes. + */ +static int do_allocation_zoned(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 start = block_group->start; + u64 num_bytes = ffe_ctl->num_bytes; + u64 avail; + u64 bytenr = block_group->start; + u64 log_bytenr; + u64 data_reloc_bytenr; + int ret = 0; + bool skip = false; + + ASSERT(btrfs_is_zoned(block_group->fs_info)); + + /* + * Do not allow non-tree-log blocks in the dedicated tree-log block + * group, and vice versa. + */ + spin_lock(&fs_info->treelog_bg_lock); + log_bytenr = fs_info->treelog_bg; + if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr))) + skip = true; + spin_unlock(&fs_info->treelog_bg_lock); + if (skip) + return 1; + + /* + * Do not allow non-relocation blocks in the dedicated relocation block + * group, and vice versa. + */ + spin_lock(&fs_info->relocation_bg_lock); + data_reloc_bytenr = fs_info->data_reloc_bg; + if (data_reloc_bytenr && + ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || + (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) + skip = true; + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; + + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } + spin_unlock(&block_group->lock); + + /* Metadata block group is activated at write time. */ + if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !btrfs_zone_activate(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock); + + if (ret) + goto out; + + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); + ASSERT(!ffe_ctl->for_data_reloc || + block_group->start == fs_info->data_reloc_bg || + fs_info->data_reloc_bg == 0); + + if (block_group->ro || + (!ffe_ctl->for_data_reloc && + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { + ret = 1; + goto out; + } + + /* + * Do not allow currently using block group to be tree-log dedicated + * block group. + */ + if (ffe_ctl->for_treelog && !fs_info->treelog_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + /* + * Do not allow currently used block group to be the data relocation + * dedicated block group. 
+ */ + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); + avail = block_group->zone_capacity - block_group->alloc_offset; + if (avail < num_bytes) { + if (ffe_ctl->max_extent_size < avail) { + /* + * With sequential allocator, free space is always + * contiguous + */ + ffe_ctl->max_extent_size = avail; + ffe_ctl->total_free_space = avail; + } + ret = 1; + goto out; + } + + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + + if (ffe_ctl->for_data_reloc) { + if (!fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + /* + * Do not allow allocations from this block group, unless it is + * for data relocation. Compared to increasing the ->ro, setting + * the ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + * + * Also, this flag avoids this block group to be zone finished. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + } + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; + spin_lock(&ctl->tree_lock); + ctl->free_space -= num_bytes; + spin_unlock(&ctl->tree_lock); + + /* + * We do not check if found_offset is aligned to stripesize. The + * address is anyway rewritten when using zone append writing. 
+ */ + + ffe_ctl->search_start = ffe_ctl->found_offset; + +out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; + if (ret && ffe_ctl->for_data_reloc) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); + spin_unlock(&fs_info->treelog_bg_lock); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + return ret; +} + +static int do_allocation(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + case BTRFS_EXTENT_ALLOC_ZONED: + return do_allocation_zoned(block_group, ffe_ctl, bg_ret); + default: + BUG(); + } +} + +static void release_block_group(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + int delalloc) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + ffe_ctl->retry_uncached = false; + break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; + default: + BUG(); + } + + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl->index); + btrfs_release_block_group(block_group, delalloc); +} + +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + if (!ffe_ctl->use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + found_extent_clustered(ffe_ctl, ins); + break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; + default: + BUG(); + } +} + +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* Block group's activeness is not a requirement for METADATA block groups. */ + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* If we can activate a new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell the + * caller to try again and hope it makes some progress by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + + /* + * We cannot activate a new block group and not enough space is left in any + * block groups. 
So, allocating a new block group may not help. But, + * there is nothing to do anyway, so let's go with it. + */ + return 0; +} + +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + return can_allocate_chunk_zoned(fs_info, ffe_ctl); + default: + BUG(); + } +} + +/* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. + */ +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl, + bool full_search) +{ + struct btrfs_root *root = fs_info->chunk_root; + int ret; + + if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && + ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) + ffe_ctl->orig_have_caching_bg = true; + + if (ins->objectid) { + found_extent(ffe_ctl, ins); + return 0; + } + + if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + return 1; + + ffe_ctl->index++; + if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) + return 1; + + /* See the comments for btrfs_loop_type for an explanation of the phases. */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) + ffe_ctl->loop++; + ffe_ctl->loop++; + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + /* Check if allocation policy allows to create a new chunk */ + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + return ret; + } + + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE_FOR_EXTENT); + + /* Do not bail out on ENOSPC since we can do more. */ + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } + else if (ret < 0) + btrfs_abort_transaction(trans, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans); + if (ret) + return ret; + } + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. 
+ */ + if (ffe_ctl->empty_size == 0 && + ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + return 1; + } + return -ENOSPC; +} + +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + +static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation, simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (space_info->max_extent_size) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + ffe_ctl->num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + ffe_ctl->use_cluster = false; + } + spin_unlock(&space_info->lock); + } + + ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl->empty_cluster); + if (ffe_ctl->last_ptr) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + ffe_ctl->hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + ffe_ctl->hint_byte = last_ptr->window_start; + ffe_ctl->use_cluster = false; + } + spin_unlock(&last_ptr->lock); + } + + return 0; +} + +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a hint. 
+ */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + +static int prepare_allocation(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: + return prepare_allocation_zoned(fs_info, ffe_ctl); + default: + BUG(); + } +} + +/* + * walks the btree of allocated extents and finds a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == start position + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently. + * + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * | |- Get a valid block group + * | |- Try to do clustered allocation in that block group + * | |- Try to do unclustered allocation in that block group + * | |- Check if the result is valid + * | | |- If valid, then exit + * | |- Jump to next block group + * | + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups + */ +static noinline int find_free_extent(struct btrfs_root *root, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + int cache_block_group_error = 0; + struct btrfs_block_group *block_group = NULL; + struct btrfs_space_info *space_info; + bool full_search = false; + + WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); + + ffe_ctl->search_start = 0; + /* For clustered allocation */ + ffe_ctl->empty_cluster = 0; + ffe_ctl->last_ptr = NULL; + ffe_ctl->use_cluster = true; + ffe_ctl->have_caching_bg = false; + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; + ffe_ctl->retry_uncached = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); + + if (btrfs_is_zoned(fs_info)) + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; + + ins->type = BTRFS_EXTENT_ITEM_KEY; + ins->objectid = 0; + ins->offset = 0; + + trace_find_free_extent(root, ffe_ctl); + + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (!space_info) { + btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + return -ENOSPC; + } + + ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); + if (ret < 0) + return ret; + + ffe_ctl->search_start = max(ffe_ctl->search_start, + first_logical_byte(fs_info)); + ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); + if (ffe_ctl->search_start == ffe_ctl->hint_byte) { + block_group = btrfs_lookup_block_group(fs_info, + ffe_ctl->search_start); + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if it's not cached. 
+ * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. + */ + if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->cached != BTRFS_CACHE_NO) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else { + ffe_ctl->index = btrfs_bg_flags_to_raid_index( + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); + ffe_ctl->hinted = true; + goto have_block_group; + } + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: + trace_find_free_extent_search_loop(root, ffe_ctl); + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) + full_search = true; + down_read(&space_info->groups_sem); + list_for_each_entry(block_group, + &space_info->block_groups[ffe_ctl->index], list) { + struct btrfs_block_group *bg_ret; + + ffe_ctl->hinted = false; + /* If the block group is read-only, we can skip it entirely. */ + if (unlikely(block_group->ro)) { + if (ffe_ctl->for_treelog) + btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); + continue; + } + + btrfs_grab_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->search_start = block_group->start; + + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. + */ + if (!block_group_bits(block_group, ffe_ctl->flags)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) + goto loop; + + /* + * This block group has different flags than we want. + * It's possible that we have MIXED_GROUP flag but no + * block group is mixed. Just skip such block group. + */ + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + continue; + } + +have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; + ret = btrfs_cache_block_group(block_group, false); + + /* + * If we get ENOMEM here or something else we want to + * try other block groups, because it may not be fatal. + * However if we can't find anything else we need to + * save our return here so that we return the actual + * error that caused problems, not ENOSPC. 
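+ * For example an -ENOMEM from btrfs_cache_block_group() is kept in
+ * cache_block_group_error and reported if the search would otherwise
+ * end with -ENOSPC.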
+ */ + if (ret < 0) { + if (!cache_block_group_error) + cache_block_group_error = ret; + ret = 0; + goto loop; + } + ret = 0; + } + + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { + if (!cache_block_group_error) + cache_block_group_error = -EIO; + goto loop; + } + + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + + bg_ret = NULL; + ret = do_allocation(block_group, ffe_ctl, &bg_ret); + if (ret > 0) + goto loop; + + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + block_group = bg_ret; + } + + /* Checks */ + ffe_ctl->search_start = round_up(ffe_ctl->found_offset, + fs_info->stripesize); + + /* move on to the next group */ + if (ffe_ctl->search_start + ffe_ctl->num_bytes > + block_group->start + block_group->length) { + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, + ffe_ctl->num_bytes); + goto loop; + } + + if (ffe_ctl->found_offset < ffe_ctl->search_start) + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, + ffe_ctl->search_start - ffe_ctl->found_offset); + + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); + if (ret == -EAGAIN) { + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, + ffe_ctl->num_bytes); + goto loop; + } + btrfs_inc_block_group_reservations(block_group); + + /* we are all good, lets return */ + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; + + trace_btrfs_reserve_extent(block_group, ffe_ctl); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + break; +loop: + if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_uncached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(block_group, + ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + + ffe_ctl->empty_size); + goto have_block_group; + } + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); + cond_resched(); + } + up_read(&space_info->groups_sem); + + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + if (ret > 0) + goto search; + + if (ret == -ENOSPC && !cache_block_group_error) { + /* + * Use ffe_ctl->total_free_space as fallback if we can't find + * any contiguous hole. + */ + if (!ffe_ctl->max_extent_size) + ffe_ctl->max_extent_size = ffe_ctl->total_free_space; + spin_lock(&space_info->lock); + space_info->max_extent_size = ffe_ctl->max_extent_size; + spin_unlock(&space_info->lock); + ins->offset = ffe_ctl->max_extent_size; + } else if (ret == -ENOSPC) { + ret = cache_block_group_error; + } + return ret; +} + +/* + * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a + * hole that is at least as big as @num_bytes. + * + * @root - The root that will contain this extent + * + * @ram_bytes - The amount of space in ram that @num_bytes take. This + * is used for accounting purposes. This value differs + * from @num_bytes only in the case of compressed extents. + * + * @num_bytes - Number of bytes to allocate on-disk. + * + * @min_alloc_size - Indicates the minimum amount of space that the + * allocator should try to satisfy. In some cases + * @num_bytes may be larger than what is required and if + * the filesystem is fragmented then allocation fails. + * However, the presence of @min_alloc_size gives a + * chance to try and satisfy the smaller allocation. + * + * @empty_size - A hint that you plan on doing more COW. 
This is the + * size in bytes the allocator should try to find free + * next to the block it returns. This is just a hint and + * may be ignored by the allocator. + * + * @hint_byte - Hint to the allocator to start searching above the byte + * address passed. It might be ignored. + * + * @ins - This key is modified to record the found hole. It will + * have the following values: + * ins->objectid == start position + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * + * @is_data - Boolean flag indicating whether an extent is + * allocated for data (true) or metadata (false) + * + * @delalloc - Boolean flag indicating whether this allocation is for + * delalloc or not. If 'true' data_rwsem of block groups + * is going to be acquired. + * + * + * Returns 0 when an allocation succeeded or < 0 when an error occurred. In + * case -ENOSPC is returned then @ins->offset will contain the size of the + * largest available hole the allocator managed to find. + */ +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct find_free_extent_ctl ffe_ctl = {}; + bool final_tried = num_bytes == min_alloc_size; + u64 flags; + int ret; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); + + flags = get_alloc_profile_by_root(root, is_data); +again: + WARN_ON(num_bytes < fs_info->sectorsize); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.min_alloc_size = min_alloc_size; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.delalloc = delalloc; + ffe_ctl.hint_byte = hint_byte; + ffe_ctl.for_treelog = for_treelog; + ffe_ctl.for_data_reloc = for_data_reloc; + + ret = find_free_extent(root, ins, &ffe_ctl); + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(fs_info, ins->objectid); + } else if (ret == -ENOSPC) { + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); + num_bytes = round_down(num_bytes, + fs_info->sectorsize); + num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = btrfs_find_space_info(fs_info, flags); + btrfs_err(fs_info, + "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", + flags, num_bytes, for_treelog, for_data_reloc); + if (sinfo) + btrfs_dump_space_info(fs_info, sinfo, + num_bytes, 1); + } + } + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "Unable to find block group for %llu", + start); + return -ENOSPC; + } + + btrfs_add_free_space(cache, start, len); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(fs_info, start, len); + + btrfs_put_block_group(cache); + return 0; +} + +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, + u64 len) +{ + struct btrfs_block_group *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(trans->fs_info, start); + if (!cache) { + btrfs_err(trans->fs_info, "unable to find block group for %llu", + 
start); + return -ENOSPC; + } + + ret = pin_down_extent(trans, cache, start, len, 1); + btrfs_put_block_group(cache); + return ret; +} + +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; + int ret; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + int type; + u32 size; + + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, ins->objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); + if (ret) { + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + btrfs_free_path(path); + + return alloc_reserved_extent(trans, ins->objectid, ins->offset); +} + +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; + int ret; + struct btrfs_extent_item *extent_item; + struct btrfs_key extent_key; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_delayed_tree_ref *ref; + u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 flags = extent_op->flags_to_set; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + + ref = btrfs_delayed_node_to_tree_ref(node); + + extent_key.objectid = node->bytenr; + if (skinny_metadata) { + extent_key.offset = ref->level; + extent_key.type = BTRFS_METADATA_ITEM_KEY; + } else { + extent_key.offset = 
node->num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + size += sizeof(*block_info); + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, extent_key.objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, + size); + if (ret) { + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + + if (skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); + btrfs_set_tree_block_level(leaf, block_info, ref->level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } + + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + } + + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_free_path(path); + + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) +{ + struct btrfs_ref generic_ref = { 0 }; + + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins->objectid, ins->offset, 0); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + offset, 0, false); + btrfs_ref_tree_mod(root->fs_info, &generic_ref); + + return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_block_group *block_group; + struct btrfs_space_info *space_info; + + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exclude dance if this fs isn't mixed. 
+ */ + if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + } + + block_group = btrfs_lookup_block_group(fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, + offset, ins, 1); + if (ret) + btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + btrfs_put_block_group(block_group); + return ret; +} + +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *buf; + u64 lockdep_owner = owner; + + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); + if (IS_ERR(buf)) + return buf; + + /* + * Extra safety check in case the extent tree is corrupted and extent + * allocator chooses to use a tree block which is already used and + * locked. + */ + if (buf->lock_owner == current->pid) { + btrfs_err_rl(fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + buf->start, btrfs_header_owner(buf), current->pid); + free_extent_buffer(buf); + return ERR_PTR(-EUCLEAN); + } + + /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + * + * The exception however is in replace_path() in relocation, where we + * hold the lock on the original fs root and then search for the reloc + * root. At that point we need to make sure any reloc root buffers are + * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make + * lockdep happy. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && + !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + /* btrfs_clear_buffer_dirty() accesses generation field. */ + btrfs_set_header_generation(buf, trans->transid); + + /* + * This needs to stay, because we could allocate a freed block from an + * old tree into a new tree, so we need to make sure this new block is + * set to the appropriate level and owner. + */ + btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); + + __btrfs_tree_lock(buf, nest); + btrfs_clear_buffer_dirty(trans, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + + set_extent_buffer_uptodate(buf); + + memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); + btrfs_set_header_level(buf, level); + btrfs_set_header_bytenr(buf, buf->start); + btrfs_set_header_generation(buf, trans->transid); + btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(buf, owner); + write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); + write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; + /* + * we allow two log transactions at a time, use different + * EXTENT bit to differentiate dirty pages. 
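+ * Log transaction 0 tags its pages with EXTENT_DIRTY and log
+ * transaction 1 uses EXTENT_NEW, matching the branch below.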
+ */ + if (buf->log_index == 0) + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); + else + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); + } else { + buf->log_index = -1; + set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); + } + /* this returns a buffer locked for blocking */ + return buf; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the tree buffer or an ERR_PTR on error. + */ +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key ins; + struct btrfs_block_rsv *block_rsv; + struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { 0 }; + u64 flags = 0; + int ret; + u32 blocksize = fs_info->nodesize; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(fs_info)) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + level, root_objectid, nest); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif + + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, + empty_size, hint, &ins, 0, 0); + if (ret) + goto out_unuse; + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, + root_objectid, nest); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = skinny_metadata ? 
false : true; + extent_op->update_flags = true; + extent_op->level = level; + + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins.objectid, ins.offset, parent); + btrfs_init_tree_ref(&generic_ref, level, root_objectid, + root->root_key.objectid, false); + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); + if (ret) + goto out_free_delayed; + } + return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + btrfs_tree_unlock(buf); + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); +out_unuse: + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); + return ERR_PTR(ret); +} + +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + struct btrfs_key drop_progress; + int drop_level; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; + int restarted; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 generation; + u64 refs; + u64 flags; + u32 nritems; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; + + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + } + + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; + + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); + + if (slot == path->slots[wc->level]) + goto reada; + + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; + + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, + wc->level - 1, 1, &refs, + &flags); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; + BUG_ON(refs == 0); + + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; + + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + } +reada: + btrfs_readahead_node_child(eb, slot); + nread++; + } + wc->reada_slot = slot; +} + +/* + * helper to process tree block while walking down the tree. + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. 
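+ * Return value 0 means keep walking down, a negative value is an error.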
+ */ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int lookup_info) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, fs_info, + eb->start, level, 1, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_set_disk_extent_flags(trans, eb, flag); + BUG_ON(ret); /* -ENOMEM */ + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + } + return 0; +} + +/* + * This is used to verify a ref exists for this root to deal with a bug where we + * would have a drop_progress key that hadn't been updated properly. + */ +static int check_ref_exists(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 parent, + int level) +{ + struct btrfs_path *path; + struct btrfs_extent_inline_ref *iref; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = lookup_extent_backref(trans, path, &iref, bytenr, + root->fs_info->nodesize, parent, + root->root_key.objectid, level, 0); + btrfs_free_path(path); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + return 1; +} + +/* + * helper to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. 
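+ * Return value 0 means the child block was locked and added to the
+ * path so the walk continues one level down, a negative value is an
+ * error.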
+ */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 generation; + u64 parent; + struct btrfs_tree_parent_check check = { 0 }; + struct btrfs_key key; + struct btrfs_ref ref = { 0 }; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + bool need_account = false; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + + check.level = level - 1; + check.transid = generation; + check.owner_root = root->root_key.objectid; + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, + path->slots[level]); + + next = find_extent_buffer(fs_info, bytenr); + if (!next) { + next = btrfs_find_create_tree_block(fs_info, bytenr, + root->root_key.objectid, level - 1); + if (IS_ERR(next)) + return PTR_ERR(next); + reada = 1; + } + btrfs_tree_lock(next); + + ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, + &wc->refs[level - 1], + &wc->flags[level - 1]); + if (ret < 0) + goto out_unlock; + + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(fs_info, "Missing references."); + ret = -EIO; + goto out_unlock; + } + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + need_account = true; + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation, 0)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(fs_info, bytenr, &check); + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { + free_extent_buffer(next); + return -EIO; + } + btrfs_tree_lock(next); + } + + level--; + ASSERT(level == btrfs_header_level(next)); + if (level != btrfs_header_level(next)) { + btrfs_err(root->fs_info, "mismatched level"); + ret = -EIO; + goto out_unlock; + } + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + ASSERT(root->root_key.objectid == + btrfs_header_owner(path->nodes[level])); + if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, + "mismatched block owner"); + ret = -EIO; + goto out_unlock; + } + parent = 0; + } 
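+
+ /*
+ * At this point parent is the bytenr of the node holding a shared
+ * backref on the child, or 0 when the child is dropped via a keyed
+ * backref of this root.
+ */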
+ + /* + * If we had a drop_progress we need to verify the refs are set + * as expected. If we find our ref then we know that from here + * on out everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, bytenr, parent, + level - 1); + if (ret < 0) + goto out_unlock; + if (ret == 0) + goto no_delete; + ret = 0; + wc->restarted = 0; + } + + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have + * already accounted them at merge time (replace_path), + * thus we could skip expensive subtree trace here. + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + need_account) { + ret = btrfs_qgroup_trace_subtree(trans, next, + generation, level - 1); + if (ret) { + btrfs_err_rl(fs_info, + "Error %d accounting shared subtree. Quota is out of sync, rescan required.", + ret); + } + } + + /* + * We need to update the next key in our walk control so we can + * update the drop_progress key accordingly. We don't care if + * find_next_key doesn't find a key because that means we're at + * the end and are going to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + fs_info->nodesize, parent); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, + 0, false); + ret = btrfs_free_extent(trans, &ref); + if (ret) + goto out_unlock; + } +no_delete: + *lookup_info = 1; + ret = 1; + +out_unlock: + btrfs_tree_unlock(next); + free_extent_buffer(next); + + return ret; +} + +/* + * helper to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. + */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. 
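+ * A count of one means no other tree references this block any more,
+ * so the walk can go back to dropping the subtree instead of only
+ * updating backrefs.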
+ */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + path->locks[level] = BTRFS_WRITE_LOCK; + + ret = btrfs_lookup_extent_info(trans, fs_info, + eb->start, level, 1, + &wc->refs[level], + &wc->flags[level]); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return ret; + } + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return 1; + } + } + } + + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); /* -ENOMEM */ + if (is_fstree(root->root_key.objectid)) { + ret = btrfs_qgroup_trace_leaf_items(trans, eb); + if (ret) { + btrfs_err_rl(fs_info, + "error %d accounting leaf items, quota is out of sync, rescan required", + ret); + } + } + } + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ + if (!path->locks[level]) { + btrfs_tree_lock(eb); + path->locks[level] = BTRFS_WRITE_LOCK; + } + btrfs_clear_buffer_dirty(trans, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else if (root->root_key.objectid != btrfs_header_owner(eb)) + goto owner_mismatch; + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])) + goto owner_mismatch; + } + + btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return 0; + +owner_mismatch: + btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", + btrfs_header_owner(eb), root->root_key.objectid); + return -EUCLEAN; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + int lookup_info = 1; + int ret = 0; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret) + break; + + if (level == 0) + break; + + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } else if (ret < 0) + break; + level = wc->level; + } + return (ret == 1) ? 0 : ret; +} + +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int max_level) +{ + int level = wc->level; + int ret; + + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; + return 0; + } else { + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; + if (ret < 0) + return ret; + + if (path->locks[level]) { + btrfs_tree_unlock_rw(path->nodes[level], + path->locks[level]); + path->locks[level] = 0; + } + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + level++; + } + } + return 1; +} + +/* + * drop a subvolume tree. 
+ * + * this function traverses the tree freeing any blocks that are only + * referenced by the tree. + * + * when a shared tree block is found, this function decreases its + * reference count by one. if update_ref is true, this function + * also makes sure backrefs for the shared block and all lower level + * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN + */ +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) +{ + const bool is_reloc_root = (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID); + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; + bool root_dropped = false; + bool unfinished_drop = false; + + btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + err = -ENOMEM; + goto out; + } + + /* + * Use join to avoid potential EINTR from transaction start. See + * wait_reserve_ticket and the whole reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + + /* + * This will help us catch people modifying the fs tree while we're + * dropping it. It is unsafe to mess with the fs tree while it's being + * dropped as we unlock the root node and parent nodes as we walk down + * the tree, assuming nothing will change. If something does change + * then we'll have stale information and drop references to blocks we've + * already dropped.
+ */ + set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + + level = btrfs_root_drop_level(root_item); + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out_end_trans; + } + WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK; + + ret = btrfs_lookup_extent_info(trans, fs_info, + path->nodes[level]->start, + level, 1, &wc->refs[level], + &wc->flags[level]); + if (ret < 0) { + err = ret; + goto out_end_trans; + } + BUG_ON(wc->refs[level] == 0); + + if (level == btrfs_root_drop_level(root_item)) + break; + + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; + WARN_ON(wc->refs[level] != 1); + level--; + } + } + + wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); + + while (1) { + + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + err = ret; + break; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); + break; + } + + if (wc->stage == DROP_REFERENCE) { + wc->drop_level = wc->level; + btrfs_node_key_to_cpu(path->nodes[wc->drop_level], + &wc->drop_progress, + path->slots[wc->drop_level]); + } + btrfs_cpu_key_to_disk(&root_item->drop_progress, + &wc->drop_progress); + btrfs_set_root_drop_level(root_item, wc->drop_level); + + BUG_ON(wc->level == 0); + if (btrfs_should_end_transaction(trans) || + (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + if (ret) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + + btrfs_end_transaction_throttle(trans); + if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { + btrfs_debug(fs_info, + "drop snapshot early exit"); + err = -EAGAIN; + goto out_free; + } + + /* + * Use join to avoid potential EINTR from transaction + * start. See wait_reserve_ticket and the whole + * reservation callchain. 
+ */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + } + } + btrfs_release_path(path); + if (err) + goto out_end_trans; + + ret = btrfs_del_root(trans, &root->root_key); + if (ret) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + + if (!is_reloc_root) { + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + } + } + + /* + * This subvolume is going to be completely dropped, and won't be + * recorded as dirty roots, thus pertrans meta rsv will not be freed at + * commit transaction time. So free it here manually. + */ + btrfs_qgroup_convert_reserved_meta(root, INT_MAX); + btrfs_qgroup_free_meta_all_pertrans(root); + + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + btrfs_add_dropped_root(trans, root); + else + btrfs_put_root(root); + root_dropped = true; +out_end_trans: + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + + btrfs_end_transaction_throttle(trans); +out_free: + kfree(wc); + btrfs_free_path(path); +out: + /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + + /* + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. + */ + if (!for_reloc && !root_dropped) + btrfs_add_dead_root(root); + return err; +} + +/* + * drop subtree rooted at tree block 'node'. 
+ * + * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code + */ +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct walk_control *wc; + int level; + int parent_level; + int ret = 0; + int wret; + + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } + + btrfs_assert_tree_write_locked(parent); + parent_level = btrfs_header_level(parent); + atomic_inc(&parent->refs); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + btrfs_assert_tree_write_locked(node); + level = btrfs_header_level(node); + path->nodes[level] = node; + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { + ret = wret; + break; + } + + wret = walk_up_tree(trans, root, path, wc, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + kfree(wc); + btrfs_free_path(path); + return ret; +} + +int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, + u64 start, u64 end) +{ + return unpin_extent_range(fs_info, start, end, false); +} + +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released or + * allocated in the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction and hold the commit root sem. We only need to hold + * it while performing the free space search since we have already + * held back allocations. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) +{ + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; + int ret; + + *trimmed = 0; + + /* Discard not supported = nothing to do. */ + if (!bdev_max_discard_sectors(device->bdev)) + return 0; + + /* Not writable = nothing to do. */ + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + return 0; + + /* No free space = nothing to do. 
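(the whole device is already allocated to chunks once total_bytes <= bytes_used)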
*/ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->fs_info; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + break; + + find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* Check if there are any CHUNK_* bits left */ + if (start > device->total_bytes) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_in_rcu(fs_info, +"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", + start, end - start + 1, + btrfs_dev_name(device), + device->total_bytes); + mutex_unlock(&fs_info->chunk_mutex); + ret = 0; + break; + } + + /* Ensure we skip the reserved space on each device. */ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); + + /* + * If find_first_clear_extent_bit finds a range that spans the + * end of the device it will set end to -1, in this case it's up + * to the caller to trim the value to the size of the device. + */ + end = min(end, device->total_bytes - 1); + + len = end - start + 1; + + /* We didn't find any extents */ + if (!len) { + mutex_unlock(&fs_info->chunk_mutex); + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, + &bytes); + if (!ret) + set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + +/* + * Trim the whole filesystem by: + * 1) trimming the free space in each block group + * 2) trimming the unallocated space on each device + * + * This will also continue trimming even if a block group or device encounters + * an error. The return value will be the last error, or 0 if nothing bad + * happens. + */ +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_block_group *cache = NULL; + struct btrfs_device *device; + u64 group_trimmed; + u64 range_end = U64_MAX; + u64 start; + u64 end; + u64 trimmed = 0; + u64 bg_failed = 0; + u64 dev_failed = 0; + int bg_ret = 0; + int dev_ret = 0; + int ret = 0; + + if (range->start == U64_MAX) + return -EINVAL; + + /* + * Check range overflow if range->len is set. + * The default range->len is U64_MAX.
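+ * A start + len that would wrap past U64_MAX is rejected with -EINVAL
+ * by the check_add_overflow() call below.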
+ */ + if (range->len != U64_MAX && + check_add_overflow(range->start, range->len, &range_end)) + return -EINVAL; + + cache = btrfs_lookup_first_block_group(fs_info, range->start); + for (; cache; cache = btrfs_next_block_group(cache)) { + if (cache->start >= range_end) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->start); + end = min(range_end, cache->start + cache->length); + + if (end - start >= range->minlen) { + if (!btrfs_block_group_done(cache)) { + ret = btrfs_cache_block_group(cache, true); + if (ret) { + bg_failed++; + bg_ret = ret; + continue; + } + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + bg_failed++; + bg_ret = ret; + continue; + } + } + } + + if (bg_failed) + btrfs_warn(fs_info, + "failed to trim %llu block group(s), last error %d", + bg_failed, bg_ret); + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + continue; + + ret = btrfs_trim_free_extents(device, &group_trimmed); + if (ret) { + dev_failed++; + dev_ret = ret; + break; + } + + trimmed += group_trimmed; + } + mutex_unlock(&fs_devices->device_list_mutex); + + if (dev_failed) + btrfs_warn(fs_info, + "failed to trim %llu device(s), last error %d", + dev_failed, dev_ret); + range->len = trimmed; + if (bg_ret) + return bg_ret; + return dev_ret; +} diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 0000000000..88c249c375 --- /dev/null +++ b/fs/btrfs/extent-tree.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_TREE_H +#define BTRFS_EXTENT_TREE_H + +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. 
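+ * Set at the loop label in find_free_extent() after waiting for the
+ * block group's caching to make progress.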
+ */ + bool retry_uncached; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); + +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); +int __must_check 
btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); + +#endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 0000000000..03c10e0ba0 --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,4676 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/page-flags.h> +#include <linux/sched/mm.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/prefetch.h> +#include <linux/fsverity.h> +#include "misc.h" +#include "extent_io.h" +#include "extent-io-tree.h" +#include "extent_map.h" +#include "ctree.h" +#include "btrfs_inode.h" +#include "bio.h" +#include "check-integrity.h" +#include "locking.h" +#include "rcu-string.h" +#include "backref.h" +#include "disk-io.h" +#include "subpage.h" +#include "zoned.h" +#include "block-group.h" +#include "compression.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "dev-replace.h" +#include "super.h" +#include "transaction.h" + +static struct kmem_cache *extent_buffer_cache; + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + unsigned long flags; + + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_add(&eb->leak_list, &fs_info->allocated_ebs); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); +} + +static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + unsigned long flags; + + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); +} + +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + unsigned long flags; + + /* + * If we didn't get into open_ctree our allocated_ebs will not be + * initialized, so just skip this. + */ + if (!fs_info->allocated_ebs.next) + return; + + WARN_ON(!list_empty(&fs_info->allocated_ebs)); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + while (!list_empty(&fs_info->allocated_ebs)) { + eb = list_first_entry(&fs_info->allocated_ebs, + struct extent_buffer, leak_list); + pr_err( + "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_header_owner(eb)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); +} +#else +#define btrfs_leak_debug_add_eb(eb) do {} while (0) +#define btrfs_leak_debug_del_eb(eb) do {} while (0) +#endif + +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary.
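+ *
+ * Typical lifecycle, as used by btrfs_read_folio() further below
+ * (illustrative summary only):
+ *
+ *	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ *
+ *	  ... pages are queued via submit_extent_page(), which allocates
+ *	  or extends bio_ctrl.bbio as needed ...
+ *
+ *	submit_one_bio(&bio_ctrl);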
+ */ +struct btrfs_bio_ctrl { + struct btrfs_bio *bbio; + enum btrfs_compression_type compress_type; + u32 len_to_oe_boundary; + blk_opf_t opf; + btrfs_bio_end_io_t end_io_func; + struct writeback_control *wbc; +}; + +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + if (!bbio) + return; + + /* Caller should ensure the bio has at least some range added */ + ASSERT(bbio->bio.bi_iter.bi_size); + + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + btrfs_submit_compressed_read(bbio); + else + btrfs_submit_bio(bbio, 0); + + /* The bbio is owned by the end_io handler now */ + bio_ctrl->bbio = NULL; +} + +/* + * Submit or fail the current bio in the bio_ctrl structure. + */ +static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + if (!bbio) + return; + + if (ret) { + ASSERT(ret < 0); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ + bio_ctrl->bbio = NULL; + } else { + submit_one_bio(bio_ctrl); + } +} + +int __init extent_buffer_init_cachep(void) +{ + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", + sizeof(struct extent_buffer), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_buffer_cache) + return -ENOMEM; + + return 0; +} + +void __cold extent_buffer_free_cachep(void) +{ + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + put_page(page); + index++; + } +} + +static void process_one_page(struct btrfs_fs_info *fs_info, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) +{ + u32 len; + + ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); + len = end + 1 - start; + + if (page_ops & PAGE_SET_ORDERED) + btrfs_page_clamp_set_ordered(fs_info, page, start, len); + if (page_ops & PAGE_START_WRITEBACK) { + btrfs_page_clamp_clear_dirty(fs_info, page, start, len); + btrfs_page_clamp_set_writeback(fs_info, page, start, len); + } + if (page_ops & PAGE_END_WRITEBACK) + btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + + if (page != locked_page && (page_ops & PAGE_UNLOCK)) + btrfs_page_end_writer_lock(fs_info, page, start, len); +} + +static void __process_pages_contig(struct address_space *mapping, + struct page *locked_page, u64 start, u64 end, + unsigned long page_ops) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + struct folio_batch fbatch; + int i; + + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + for (i = 0; i < found_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + process_one_page(fs_info, &folio->page, locked_page, + page_ops, start, end); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +static noinline void __unlock_for_delalloc(struct inode *inode, + struct 
page *locked_page, + u64 start, u64 end) +{ + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + + ASSERT(locked_page); + if (index == locked_page->index && end_index == index) + return; + + __process_pages_contig(inode->i_mapping, locked_page, start, end, + PAGE_UNLOCK); +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 start, + u64 end) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + u64 processed_end = start; + struct folio_batch fbatch; + + if (index == locked_page->index && index == end_index) + return 0; + + folio_batch_init(&fbatch); + while (index <= end_index) { + unsigned int found_folios, i; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + if (found_folios == 0) + goto out; + + for (i = 0; i < found_folios; i++) { + struct page *page = &fbatch.folios[i]->page; + u32 len = end + 1 - start; + + if (page == locked_page) + continue; + + if (btrfs_page_start_writer_lock(fs_info, page, start, + len)) + goto out; + + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, + len); + goto out; + } + + processed_end = page_offset(page) + PAGE_SIZE - 1; + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return 0; +out: + folio_batch_release(&fbatch); + if (processed_end > start) + __unlock_for_delalloc(inode, locked_page, start, processed_end); + return -EAGAIN; +} + +/* + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes. + * + * @start: The original start bytenr to search. + * Will store the extent range start bytenr. + * @end: The original end bytenr of the search range + * Will store the extent range end bytenr. + * + * Return true if we find a delalloc range which starts inside the original + * range, and @start/@end will store the delalloc range start/end. + * + * Return false if we can't find any delalloc range which starts inside the + * original range, and @start/@end will be the non-delalloc range start/end. + */ +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, + struct page *locked_page, u64 *start, + u64 *end) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; + u64 delalloc_start; + u64 delalloc_end; + bool found; + struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + + /* Caller should pass a valid @end to indicate the search range end */ + ASSERT(orig_end > orig_start); + + /* The range should at least cover part of the page */ + ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || + orig_end <= page_offset(locked_page))); +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); + if (!found || delalloc_end <= *start || delalloc_start > orig_end) { + *start = delalloc_start; + + /* @delalloc_end can be -1, never go beyond @orig_end */ + *end = min(delalloc_end, orig_end); + free_extent_state(cached_state); + return false; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + */ + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + free_extent_state(cached_state); + cached_state = NULL; + if (!loops) { + max_bytes = PAGE_SIZE; + loops = 1; + goto again; + } else { + found = false; + goto out_failed; + } + } + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, &cached_state); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1, cached_state); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, + &cached_state); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + free_extent_state(cached_state); + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + struct page *locked_page, + u32 clear_bits, unsigned long page_ops) +{ + clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); + + __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, + start, end, page_ops); +} + +static bool btrfs_verify_page(struct page *page, u64 start) +{ + if (!fsverity_active(page->mapping->host) || + PageUptodate(page) || + start >= i_size_read(page->mapping->host)) + return true; + return fsverity_verify_page(page); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + + if (uptodate && btrfs_verify_page(page, start)) + btrfs_page_set_uptodate(fs_info, page, start, len); + else + btrfs_page_clear_uptodate(fs_info, page, start, len); + + if (!btrfs_is_subpage(fs_info, page)) + unlock_page(page); + else + btrfs_subpage_end_reader(fs_info, page, start, len); +} + +/* + * after a writepage 
IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_writepage(struct btrfs_bio *bbio) +{ + struct bio *bio = &bbio->bio; + int error = blk_status_to_errno(bio->bi_status); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + u64 start = page_offset(page) + bvec->bv_offset; + u32 len = bvec->bv_len; + + /* Our read/write should always be sector aligned. */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + btrfs_info(fs_info, + "incomplete page write with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + + btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + if (error) + mapping_set_error(page->mapping, error); + btrfs_page_clear_writeback(fs_info, page, start, len); + } + + bio_put(bio); +} + +/* + * Record previously processed extent range + * + * For endio_readpage_release_extent() to handle a full extent range, reducing + * the extent io operations. + */ +struct processed_extent { + struct btrfs_inode *inode; + /* Start of the range in @inode */ + u64 start; + /* End of the range in @inode */ + u64 end; + bool uptodate; +}; + +/* + * Try to release processed extent range + * + * May not release the extent range right now if the current range is + * contiguous to processed extent. + * + * Will release processed extent when any of @inode, @uptodate, the range is + * no longer contiguous to the processed range. + * + * Passing @inode == NULL will force processed extent to be released. + */ +static void endio_readpage_release_extent(struct processed_extent *processed, + struct btrfs_inode *inode, u64 start, u64 end, + bool uptodate) +{ + struct extent_state *cached = NULL; + struct extent_io_tree *tree; + + /* The first extent, initialize @processed */ + if (!processed->inode) + goto update; + + /* + * Contiguous to processed extent, just uptodate the end. + * + * Several things to notice: + * + * - bio can be merged as long as on-disk bytenr is contiguous + * This means we can have page belonging to other inodes, thus need to + * check if the inode still matches. + * - bvec can contain range beyond current page for multi-page bvec + * Thus we need to do processed->end + 1 >= start check + */ + if (processed->inode == inode && processed->uptodate == uptodate && + processed->end + 1 >= start && end >= processed->end) { + processed->end = end; + return; + } + + tree = &processed->inode->io_tree; + /* + * Now we don't have range contiguous to the processed range, release + * the processed range now. 
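+ *
+ * Example (illustrative, assuming 4K sectors): with a processed range of
+ * [0, 8K) for inode A, a following completed range of [8K, 12K) for the
+ * same inode and uptodate state is simply merged; a range for a different
+ * inode, or one leaving a gap such as [16K, 20K), makes us unlock the
+ * accumulated [0, 8K) range here first.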
+ */ + unlock_extent(tree, processed->start, processed->end, &cached); + +update: + /* Update processed to current range */ + processed->inode = inode; + processed->start = start; + processed->end = end; + processed->uptodate = uptodate; +} + +static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +{ + ASSERT(PageLocked(page)); + if (!btrfs_is_subpage(fs_info, page)) + return; + + ASSERT(PagePrivate(page)); + btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct btrfs_bio *bbio) +{ + struct bio *bio = &bbio->bio; + struct bio_vec *bvec; + struct processed_extent processed = { 0 }; + /* + * The offset to the beginning of a bio, since one bio can never be + * larger than UINT_MAX, u32 here is enough. + */ + u32 bio_offset = 0; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { + bool uptodate = !bio->bi_status; + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + u64 start; + u64 end; + u32 len; + + btrfs_debug(fs_info, + "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", + bio->bi_iter.bi_sector, bio->bi_status, + bbio->mirror_num); + + /* + * We always issue full-sector reads, but if some block in a + * page fails to read, blk_update_request() will advance + * bv_offset and adjust bv_len to compensate. Print a warning + * for unaligned offsets, and an error if they don't add up to + * a full sector. + */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, + sectorsize)) + btrfs_info(fs_info, + "incomplete page read with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + len = bvec->bv_len; + + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + + /* + * Zero out the remaining part if this range straddles + * i_size. + * + * Here we should only zero the range inside the bvec, + * not touch anything else. + * + * NOTE: i_size is exclusive while end is inclusive. + */ + if (page->index == end_index && i_size <= end) { + u32 zero_start = max(offset_in_page(i_size), + offset_in_page(start)); + + zero_user_segment(page, zero_start, + offset_in_page(end) + 1); + } + } + + /* Update page status and unlock. */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, uptodate); + + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; + + } + /* Release the last extent */ + endio_readpage_release_extent(&processed, NULL, 0, 0, false); + bio_put(bio); +} + +/* + * Populate every free slot in a provided array with pages. 
+ * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped + * + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, the partially allocated pages would be freed and + * the array slots zeroed + */ +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +{ + unsigned int allocated; + + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; + + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + + if (allocated == nr_pages) + return 0; + + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) { + for (int i = 0; i < allocated; i++) { + __free_page(page_array[i]); + page_array[i] = NULL; + } + return -ENOMEM; + } + + memalloc_retry_wait(GFP_NOFS); + } + return 0; +} + +static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, u64 disk_bytenr, + unsigned int pg_offset) +{ + struct bio *bio = &bio_ctrl->bbio->bio; + struct bio_vec *bvec = bio_last_bvec_all(bio); + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { + /* + * For compression, all IO should have its logical bytenr set + * to the starting bytenr of the compressed extent. + */ + return bio->bi_iter.bi_sector == sector; + } + + /* + * The contig check requires the following conditions to be met: + * + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. + */ + return bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == + page_offset(page) + pg_offset; +} + +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, u64 file_offset) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *bbio; + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, + bio_ctrl->end_io_func, NULL); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + bbio->file_offset = file_offset; + bio_ctrl->bbio = bbio; + bio_ctrl->len_to_oe_boundary = U32_MAX; + + /* Limit data write bios to the ordered boundary. */ + if (bio_ctrl->wbc) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + bbio->ordered = ordered; + } + + /* + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. 
+ */ + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(bio_ctrl->wbc, &bbio->bio); + } +} + +/* + * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio + * @size: portion of page that we want to write to + * @pg_offset: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * + * This will either add the page into the existing @bio_ctrl->bbio, or allocate a + * new one in @bio_ctrl->bbio. + * The mirror number for this IO should already be initialized in + * @bio_ctrl->mirror_num. + */ +static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct page *page, + size_t size, unsigned long pg_offset) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + + ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(bio_ctrl->end_io_func); + + if (bio_ctrl->bbio && + !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + submit_one_bio(bio_ctrl); + + do { + u32 len = size; + + /* Allocate new bio if needed */ + if (!bio_ctrl->bbio) { + alloc_new_bio(inode, bio_ctrl, disk_bytenr, + page_offset(page) + pg_offset); + } + + /* Cap to the current ordered extent boundary if there is one. */ + if (len > bio_ctrl->len_to_oe_boundary) { + ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); + ASSERT(is_data_inode(&inode->vfs_inode)); + len = bio_ctrl->len_to_oe_boundary; + } + + if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + /* bio full: move on to a new one */ + submit_one_bio(bio_ctrl); + continue; + } + + if (bio_ctrl->wbc) + wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + + size -= len; + pg_offset += len; + disk_bytenr += len; + + /* + * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * sector aligned. alloc_new_bio() then sets it to the end of + * our ordered extent for writes into zoned devices. + * + * When len_to_oe_boundary is tracking an ordered extent, we + * trust the ordered extent code to align things properly, and + * the check above to cap our write to the ordered extent + * boundary is correct. + * + * When len_to_oe_boundary is U32_MAX, the cap above would + * result in a 4095 byte IO for the last page right before + * we hit the bio limit of UINT_MAX. bio_add_page() has all + * the checks required to make sure we don't overflow the bio, + * and we should just ignore len_to_oe_boundary completely + * unless we're using it to track an ordered extent. + * + * It's pretty hard to make a bio sized U32_MAX, but it can + * happen when the page cache is able to feed us contiguous + * pages for large extents. + */ + if (bio_ctrl->len_to_oe_boundary != U32_MAX) + bio_ctrl->len_to_oe_boundary -= len; + + /* Ordered extent boundary: move on to a new bio. */ + if (bio_ctrl->len_to_oe_boundary == 0) + submit_one_bio(bio_ctrl); + } while (size); +} + +static int attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page, + struct btrfs_subpage *prealloc) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int ret = 0; + + /* + * If the page is mapped to btree inode, we should hold the private + * lock to prevent race. + * For cloned or dummy extent buffers, their pages are not mapped and + * will not race with any other ebs.
+ */ + if (page->mapping) + lockdep_assert_held(&page->mapping->private_lock); + + if (fs_info->nodesize >= PAGE_SIZE) { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else + WARN_ON(page->private != (unsigned long)eb); + return 0; + } + + /* Already mapped, just free prealloc */ + if (PagePrivate(page)) { + btrfs_free_subpage(prealloc); + return 0; + } + + if (prealloc) + /* Has preallocated memory for subpage */ + attach_page_private(page, prealloc); + else + /* Do new allocation to attach subpage */ + ret = btrfs_attach_subpage(fs_info, page, + BTRFS_SUBPAGE_METADATA); + return ret; +} + +int set_page_extent_mapped(struct page *page) +{ + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + + if (PagePrivate(page)) + return 0; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + + if (btrfs_is_subpage(fs_info, page)) + return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return 0; +} + +void clear_page_extent_mapped(struct page *page) +{ + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + + if (!PagePrivate(page)) + return; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + if (btrfs_is_subpage(fs_info, page)) + return btrfs_detach_subpage(fs_info, page); + + detach_page_private(page); +} + +static struct extent_map * +__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + u64 start, u64 len, struct extent_map **em_cached) +{ + struct extent_map *em; + + if (em_cached && *em_cached) { + em = *em_cached; + if (extent_map_in_tree(em) && start >= em->start && + start < extent_map_end(em)) { + refcount_inc(&em->refs); + return em; + } + + free_extent_map(em); + *em_cached = NULL; + } + + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); + if (em_cached && !IS_ERR(em)) { + BUG_ON(*em_cached); + refcount_inc(&em->refs); + *em_cached = em; + } + return em; +} +/* + * basic readpage implementation. 
Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + * XXX JDM: This needs looking at to ensure proper page locking + * return 0 on success, otherwise return error + */ +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) +{ + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = page_offset(page); + const u64 end = start + PAGE_SIZE - 1; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + struct extent_map *em; + int ret = 0; + size_t pg_offset = 0; + size_t iosize; + size_t blocksize = inode->i_sb->s_blocksize; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_extent(tree, start, end, NULL); + unlock_page(page); + return ret; + } + + if (page->index == last_byte >> PAGE_SHIFT) { + size_t zero_offset = offset_in_page(last_byte); + + if (zero_offset) { + iosize = PAGE_SIZE - zero_offset; + memzero_page(page, zero_offset, iosize); + } + } + bio_ctrl->end_io_func = end_bio_extent_readpage; + begin_page_read(fs_info, page); + while (cur <= end) { + enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + bool force_bio_submit = false; + u64 disk_bytenr; + + ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); + if (cur >= last_byte) { + iosize = PAGE_SIZE - pg_offset; + memzero_page(page, pg_offset, iosize); + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); + break; + } + em = __get_extent_map(inode, page, pg_offset, cur, + end - cur + 1, em_cached); + if (IS_ERR(em)) { + unlock_extent(tree, cur, end, NULL); + end_page_read(page, false, cur, end + 1 - cur); + return PTR_ERR(em); + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + compress_type = em->compress_type; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = ALIGN(iosize, blocksize); + if (compress_type != BTRFS_COMPRESS_NONE) + disk_bytenr = em->block_start; + else + disk_bytenr = em->block_start + extent_offset; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + + /* + * If we have a file range that points to a compressed extent + * and it's followed by a consecutive file range that points + * to the same compressed extent (possibly with a different + * offset and/or length, so it either points to the whole extent + * or only part of it), we must make sure we do not submit a + * single bio to populate the pages for the 2 ranges because + * this makes the compressed extent read zero out the pages + * belonging to the 2nd range. Imagine the following scenario: + * + * File layout + * [0 - 8K] [8K - 24K] + * | | + * | | + * points to extent X, points to extent X, + * offset 4K, length of 8K offset 0, length 16K + * + * [extent X, compressed length = 4K uncompressed length = 16K] + * + * If the bio to read the compressed extent covers both ranges, + * it will decompress extent X into the pages belonging to the + * first range and then it will stop, zeroing out the remaining + * pages that belong to the other range that points to extent X. + * So here we make sure we submit 2 bios, one for the first + * range and another one for the third range. 
Both will target + * the same physical extent from disk, but we can't currently + * make the compressed bio endio callback populate the pages + * for both ranges because each compressed bio is tightly + * coupled with a single extent map, and each range can have + * an extent map with a different offset value relative to the + * uncompressed data of our extent and different lengths. This + * is a corner case so we prioritize correctness over + * non-optimal behavior (submitting 2 bios for the same extent). + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + prev_em_start && *prev_em_start != (u64)-1 && + *prev_em_start != em->start) + force_bio_submit = true; + + if (prev_em_start) + *prev_em_start = em->start; + + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + memzero_page(page, pg_offset, iosize); + + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (block_start == EXTENT_MAP_INLINE) { + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + if (bio_ctrl->compress_type != compress_type) { + submit_one_bio(bio_ctrl); + bio_ctrl->compress_type = compress_type; + } + + if (force_bio_submit) + submit_one_bio(bio_ctrl); + submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, + pg_offset); + cur = cur + iosize; + pg_offset += iosize; + } + + return 0; +} + +int btrfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + submit_one_bio(&bio_ctrl); + return ret; +} + +static inline void contiguous_readpages(struct page *pages[], int nr_pages, + u64 start, u64 end, + struct extent_map **em_cached, + struct btrfs_bio_ctrl *bio_ctrl, + u64 *prev_em_start) +{ + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); + int index; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + for (index = 0; index < nr_pages; index++) { + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, + prev_em_start); + put_page(pages[index]); + } +} + +/* + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if btrfs_run_delalloc_range function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. 
+ * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) + */ +static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, + struct page *page, struct writeback_control *wbc) +{ + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; + u64 delalloc_start = page_start; + u64 delalloc_end = page_end; + u64 delalloc_to_write = 0; + int ret = 0; + + while (delalloc_start < page_end) { + delalloc_end = page_end; + if (!find_lock_delalloc_range(&inode->vfs_inode, page, + &delalloc_start, &delalloc_end)) { + delalloc_start = delalloc_end + 1; + continue; + } + + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, + delalloc_end, wbc); + if (ret < 0) + return ret; + + delalloc_start = delalloc_end + 1; + } + + /* + * delalloc_end is already one less than the total length, so + * we don't subtract one from PAGE_SIZE + */ + delalloc_to_write += + DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); + + /* + * If btrfs_run_dealloc_range() already started I/O and unlocked + * the pages, we just need to account for them here. + */ + if (ret == 1) { + wbc->nr_to_write -= delalloc_to_write; + return 1; + } + + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + return 0; +} + +/* + * Find the first byte we need to write. + * + * For subpage, one page can contain several sectors, and + * __extent_writepage_io() will just grab all extent maps in the page + * range and try to submit all non-inline/non-compressed extents. + * + * This is a big problem for subpage, we shouldn't re-submit already written + * data at all. + * This function will lookup subpage dirty bit to find which range we really + * need to submit. + * + * Return the next dirty range in [@start, @end). + * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + */ +static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, + struct page *page, u64 *start, u64 *end) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage_info *spi = fs_info->subpage_info; + u64 orig_start = *start; + /* Declare as unsigned long so we can use bitmap ops */ + unsigned long flags; + int range_start_bit; + int range_end_bit; + + /* + * For regular sector size == page size case, since one page only + * contains one sector, we return the page offset directly. + */ + if (!btrfs_is_subpage(fs_info, page)) { + *start = page_offset(page); + *end = page_offset(page) + PAGE_SIZE; + return; + } + + range_start_bit = spi->dirty_offset + + (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + + /* We should have the page locked, but just in case */ + spin_lock_irqsave(&subpage->lock, flags); + bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, + spi->dirty_offset + spi->bitmap_nr_bits); + spin_unlock_irqrestore(&subpage->lock, flags); + + range_start_bit -= spi->dirty_offset; + range_end_bit -= spi->dirty_offset; + + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; + *end = page_offset(page) + range_end_bit * fs_info->sectorsize; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. 
+ * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + struct page *page, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size, + int *nr_ret) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 cur = page_offset(page); + u64 end = cur + PAGE_SIZE - 1; + u64 extent_offset; + u64 block_start; + struct extent_map *em; + int ret = 0; + int nr = 0; + + ret = btrfs_writepage_cow_fixup(page); + if (ret) { + /* Fixup worker will requeue */ + redirty_page_for_writepage(bio_ctrl->wbc, page); + unlock_page(page); + return 1; + } + + bio_ctrl->end_io_func = end_bio_extent_writepage; + while (cur <= end) { + u32 len = end - cur + 1; + u64 disk_bytenr; + u64 em_end; + u64 dirty_range_start = cur; + u64 dirty_range_end; + u32 iosize; + + if (cur >= i_size) { + btrfs_mark_ordered_io_finished(inode, page, cur, len, + true); + /* + * This range is beyond i_size, thus we don't need to + * bother writing back. + * But we still need to clear the dirty subpage bit, or + * the next time the page gets dirtied, we will try to + * writeback the sectors with subpage dirty bits, + * causing writeback without ordered extent. + */ + btrfs_page_clear_dirty(fs_info, page, cur, len); + break; + } + + find_next_dirty_byte(fs_info, page, &dirty_range_start, + &dirty_range_end); + if (cur < dirty_range_start) { + cur = dirty_range_start; + continue; + } + + em = btrfs_get_extent(inode, NULL, 0, cur, len); + if (IS_ERR(em)) { + ret = PTR_ERR_OR_ZERO(em); + goto out_error; + } + + extent_offset = cur - em->start; + em_end = extent_map_end(em); + ASSERT(cur <= em_end); + ASSERT(cur < end); + ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); + + block_start = em->block_start; + disk_bytenr = em->block_start + extent_offset; + + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + + /* + * Note that em_end from extent_map_end() and dirty_range_end from + * find_next_dirty_byte() are all exclusive + */ + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; + free_extent_map(em); + em = NULL; + + btrfs_set_range_writeback(inode, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(inode->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); + } + + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * page for range already written to disk. + */ + btrfs_page_clear_dirty(fs_info, page, cur, iosize); + + submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, + cur - page_offset(page)); + cur += iosize; + nr++; + } + + btrfs_page_assert_not_dirty(fs_info, page); + *nr_ret = nr; + return 0; + +out_error: + /* + * If we finish without problem, we should not only clear page dirty, + * but also empty subpage dirty bits + */ + *nr_ret = nr; + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + * + * Return 0 if everything goes well. + * Return <0 for error. 
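+ *
+ * Rough per-page call sequence, summarizing the function below:
+ *
+ *	set_page_extent_mapped(page);
+ *	writepage_delalloc(...);	run delalloc (allocation/COW) for the page
+ *	__extent_writepage_io(...);	map extents and queue the write bios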
+ */ +static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +{ + struct folio *folio = page_folio(page); + struct inode *inode = page->mapping->host; + const u64 page_start = page_offset(page); + int ret; + int nr = 0; + size_t pg_offset; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_SHIFT; + + trace___extent_writepage(page, inode, bio_ctrl->wbc); + + WARN_ON(!PageLocked(page)); + + pg_offset = offset_in_page(i_size); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); + return 0; + } + + if (page->index == end_index) + memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); + + ret = set_page_extent_mapped(page); + if (ret < 0) + goto done; + + ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + if (ret == 1) + return 0; + if (ret) + goto done; + + ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); + if (ret == 1) + return 0; + + bio_ctrl->wbc->nr_to_write--; + +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, + PAGE_SIZE, !ret); + mapping_set_error(page->mapping, ret); + } + unlock_page(page); + ASSERT(ret <= 0); + return ret; +} + +void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + +/* + * Lock extent buffer status and pages for writeback. + * + * Return %false if the extent buffer doesn't need to be submitted (e.g. the + * extent buffer is not dirty) + * Return %true if the extent buffer is submitted to bio. + */ +static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, + struct writeback_control *wbc) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + bool ret = false; + + btrfs_tree_lock(eb); + while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (wbc->sync_mode != WB_SYNC_ALL) + return false; + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + } + + /* + * We need to do this to prevent races in people who check if the eb is + * under IO since we can end up having no IO bits set for a short period + * of time. + */ + spin_lock(&eb->refs_lock); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); + ret = true; + } else { + spin_unlock(&eb->refs_lock); + } + btrfs_tree_unlock(eb); + return ret; +} + +static void set_btree_ioerr(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); + + /* + * A read may stumble upon this buffer later, make sure that it gets an + * error and knows there was an error. + */ + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + /* + * We need to set the mapping with the io error as well because a write + * error will flip the file system readonly, and then syncfs() will + * return a 0 because we are readonly if we don't modify the err seq for + * the superblock.
+ */ + mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); + + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. + * We do this because while the transaction is running and before it's + * committing (when we call filemap_fdata[write|wait]_range against + * the btree inode), we might have + * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it + * returns an error or an error happens during writeback, when we're + * committing the transaction we wouldn't know about it, since the pages + * can be no longer dirty nor marked anymore for writeback (if a + * subsequent modification to the extent buffer didn't happen before the + * transaction commit), which makes filemap_fdata[write|wait]_range not + * able to find the pages tagged with SetPageError at transaction + * commit time. So if this happens we must abort the transaction, + * otherwise we commit a super block with btree roots that point to + * btree nodes/leafs whose content on disk is invalid - either garbage + * or the content of some node/leaf from a past generation that got + * cowed or deleted and is no longer valid. + * + * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would + * not be enough - we need to distinguish between log tree extents vs + * non-log tree extents, and the next filemap_fdatawait_range() call + * will catch and clear such errors in the mapping - and that call might + * be from a log sync and not from a transaction commit. Also, checking + * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is + * not done and would not be reliable - the eb might have been released + * from memory and reading it back again means that flag would not be + * set (since it's a runtime flag, not persisted on disk). + * + * Using the flags below in the btree inode also makes us achieve the + * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started + * writeback for all dirty pages and before filemap_fdatawait_range() + * is called, the writeback for all dirty pages had already finished + * with errors - because we were not using AS_EIO/AS_ENOSPC, + * filemap_fdatawait_range() would return success, as it could not know + * that writeback errors happened (the pages were no longer tagged for + * writeback). + */ + switch (eb->log_index) { + case -1: + set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); + break; + case 0: + set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); + break; + case 1: + set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); + break; + default: + BUG(); /* unexpected, logic error */ + } +} + +/* + * The endio specific version which won't touch any unsafe spinlock in endio + * context. 
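+ *
+ * It only takes rcu_read_lock() and bumps the eb refcount with
+ * atomic_inc_not_zero(), so neither a sleeping lock nor
+ * mapping->private_lock is required (see the body below).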
+ */ +static struct extent_buffer *find_extent_buffer_nolock( + struct btrfs_fs_info *fs_info, u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + return eb; + } + rcu_read_unlock(); + return NULL; +} + +static void extent_buffer_write_end_io(struct btrfs_bio *bbio) +{ + struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; + bool uptodate = !bbio->bio.bi_status; + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; + + if (!uptodate) + set_btree_ioerr(eb); + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; + struct page *page = bvec->bv_page; + u32 len = bvec->bv_len; + + btrfs_page_clear_writeback(fs_info, page, start, len); + bio_offset += len; + } + + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); + + bio_put(&bbio->bio); +} + +static void prepare_eb_write(struct extent_buffer *eb) +{ + u32 nritems; + unsigned long start; + unsigned long end; + + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); + + /* Set btree blocks beyond nritems with 0 to avoid stale content */ + nritems = btrfs_header_nritems(eb); + if (btrfs_header_level(eb) > 0) { + end = btrfs_node_key_ptr_offset(eb, nritems); + memzero_extent_buffer(eb, end, eb->len - end); + } else { + /* + * Leaf: + * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 + */ + start = btrfs_item_nr_offset(eb, nritems); + end = btrfs_item_nr_offset(eb, 0); + if (nritems == 0) + end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); + else + end += btrfs_item_offset(eb, nritems - 1); + memzero_extent_buffer(eb, start, end - start); + } +} + +static noinline_for_stack void write_one_eb(struct extent_buffer *eb, + struct writeback_control *wbc) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_bio *bbio; + + prepare_eb_write(eb); + + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), + eb->fs_info, extent_buffer_write_end_io, eb); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(wbc, &bbio->bio); + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + if (fs_info->nodesize < PAGE_SIZE) { + struct page *p = eb->pages[0]; + + lock_page(p); + btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + eb->len)) { + clear_page_dirty_for_io(p); + wbc->nr_to_write--; + } + __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); + wbc_account_cgroup_owner(wbc, p, eb->len); + unlock_page(p); + } else { + for (int i = 0; i < num_extent_pages(eb); i++) { + struct page *p = eb->pages[i]; + + lock_page(p); + clear_page_dirty_for_io(p); + set_page_writeback(p); + __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); + wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); + wbc->nr_to_write--; + unlock_page(p); + } + } + btrfs_submit_bio(bbio, 0); +} + +/* + * Submit one subpage btree page. + * + * The main difference to submit_eb_page() is: + * - Page locking + * For subpage, we don't rely on page locking at all. + * + * - Flush write bio + * We only flush bio if we may be unable to fit current extent buffers into + * current bio. 
+ * + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; + u64 page_start = page_offset(page); + int bit_start = 0; + int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + + /* Lock and write each dirty extent buffers in the range */ + while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct extent_buffer *eb; + unsigned long flags; + u64 start; + + /* + * Take private lock to ensure the subpage won't be detached + * in the meantime. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + break; + } + spin_lock_irqsave(&subpage->lock, flags); + if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + subpage->bitmaps)) { + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + bit_start++; + continue; + } + + start = page_start + bit_start * fs_info->sectorsize; + bit_start += sectors_per_node; + + /* + * Here we just want to grab the eb without touching extra + * spin locks, so call find_extent_buffer_nolock(). + */ + eb = find_extent_buffer_nolock(fs_info, start); + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + + /* + * The eb has already reached 0 refs thus find_extent_buffer() + * doesn't return it. We don't need to write back such eb + * anyway. + */ + if (!eb) + continue; + + if (lock_extent_buffer_for_io(eb, wbc)) { + write_one_eb(eb, wbc); + submitted++; + } + free_extent_buffer(eb); + } + return submitted; +} + +/* + * Submit all page(s) of one extent buffer. + * + * @page: the page of one extent buffer + * @eb_context: to determine if we need to submit this page, if current page + * belongs to this eb, we don't need to submit + * + * The caller should pass each page in their bytenr order, and here we use + * @eb_context to determine if we have submitted pages of one extent buffer. + * + * If we have, we just skip until we hit a new page that doesn't belong to + * current @eb_context. + * + * If not, we submit all the page(s) of the extent buffer. + * + * Return >0 if we have submitted the extent buffer successfully. + * Return 0 if we don't need to submit the page, as it's already submitted by + * previous call. + * Return <0 for fatal error. + */ +static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +{ + struct writeback_control *wbc = ctx->wbc; + struct address_space *mapping = page->mapping; + struct extent_buffer *eb; + int ret; + + if (!PagePrivate(page)) + return 0; + + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + return submit_eb_subpage(page, wbc); + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON but no point + * crashing the machine for something we can survive anyway. 
+ */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + if (eb == ctx->eb) { + spin_unlock(&mapping->private_lock); + return 0; + } + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + return 0; + + ctx->eb = eb; + + ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); + if (ret) { + if (ret == -EBUSY) + ret = 0; + free_extent_buffer(eb); + return ret; + } + + if (!lock_extent_buffer_for_io(eb, wbc)) { + free_extent_buffer(eb); + return 0; + } + /* Implies write in zoned mode. */ + if (ctx->zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); + ctx->zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); + free_extent_buffer(eb); + return 1; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct btrfs_eb_write_context ctx = { .wbc = wbc }; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct folio_batch fbatch; + unsigned int nr_folios; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + xa_mark_t tag; + + folio_batch_init(&fbatch); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + btrfs_zoned_meta_io_lock(fs_info); +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch))) { + unsigned i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + ret = submit_eb_page(&folio->page, &ctx); + if (ret == 0) + continue; + if (ret < 0) { + done = 1; + break; + } + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + folio_batch_release(&fbatch); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + /* + * If something went wrong, don't allow any metadata write bio to be + * submitted. + * + * This would prevent use-after-free if we had dirty pages not + * cleaned up, which can still happen by fuzzed images. + * + * - Bad extent tree + * Allowing existing tree block to be allocated for other trees. + * + * - Log tree operations + * Exiting tree blocks get allocated to log tree, bumps its + * generation, then get cleaned in tree re-balance. + * Such tree block will not be written back, since it's clean, + * thus no WRITTEN flag set. + * And after log writes back, this tree block is not traced by + * any dirty extent_io_tree. + * + * - Offending tree block gets re-dirtied from its original owner + * Since it has bumped generation, no WRITTEN flag, it can be + * reused without COWing. This tree block will not be traced + * by btrfs_transaction::dirty_pages. + * + * Now such dirty tree block will not be cleaned by any dirty + * extent io tree. 
Thus we don't want to submit such wild eb + * if the fs already has error. + * + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. + */ + if (ret > 0) + ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) + ret = -EROFS; + + if (ctx.zoned_bg) + btrfs_put_block_group(ctx.zoned_bg); + btrfs_zoned_meta_io_unlock(fs_info); + return ret; +} + +/* + * Walk the list of dirty pages of the given address space and write all of them. + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @bio_ctrl: holds context for the write, namely the bio + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct address_space *mapping, + struct btrfs_bio_ctrl *bio_ctrl) +{ + struct writeback_control *wbc = bio_ctrl->wbc; + struct inode *inode = mapping->host; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct folio_batch fbatch; + unsigned int nr_folios; + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; + int scanned = 0; + xa_mark_t tag; + + /* + * We have to hold onto the inode so that ordered extents can do their + * work when the IO finishes. The alternative to this is failing to add + * an ordered extent if the igrab() fails there and that is a huge pain + * to deal with, so instead just hold onto the inode throughout the + * writepages operation. If it fails here we are freeing up the inode + * anyway and we'd rather not waste our time writing out stuff that is + * going to be truncated anyway. + */ + if (!igrab(inode)) + return 0; + + folio_batch_init(&fbatch); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. 
+ */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && !nr_to_write_done && (index <= end) && + (nr_folios = filemap_get_folios_tag(mapping, &index, + end, tag, &fbatch))) { + unsigned i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + done_index = folio_next_index(folio); + /* + * At this point we hold neither the i_pages lock nor + * the page lock: the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to + * tmpfs file mapping + */ + if (!folio_trylock(folio)) { + submit_write_bio(bio_ctrl, 0); + folio_lock(folio); + } + + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + continue; + } + + if (!folio_test_dirty(folio)) { + /* Someone wrote it for us. */ + folio_unlock(folio); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (folio_test_writeback(folio)) + submit_write_bio(bio_ctrl, 0); + folio_wait_writeback(folio); + } + + if (folio_test_writeback(folio) || + !folio_clear_dirty_for_io(folio)) { + folio_unlock(folio); + continue; + } + + ret = __extent_writepage(&folio->page, bio_ctrl); + if (ret < 0) { + done = 1; + break; + } + + /* + * The filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time. + */ + nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && + wbc->nr_to_write <= 0); + } + folio_batch_release(&fbatch); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + submit_write_bio(bio_ctrl, 0); + goto retry; + } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + + btrfs_add_delayed_iput(BTRFS_I(inode)); + return ret; +} + +/* + * Submit the pages in the range to bio for call sites which delalloc range has + * already been ran (aka, ordered extent inserted) and all pages are still + * locked. 
+ */ +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty) +{ + bool found_error = false; + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + loff_t i_size = i_size_read(inode); + u64 cur = start; + struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + }; + + if (wbc->no_cgroup_owner) + bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; + + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + + while (cur <= end) { + u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + u32 cur_len = cur_end + 1 - cur; + struct page *page; + int nr = 0; + + page = find_get_page(mapping, cur >> PAGE_SHIFT); + ASSERT(PageLocked(page)); + if (pages_dirty && page != locked_page) { + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + } + + ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, + i_size, &nr); + if (ret == 1) + goto next_page; + + /* Make sure the mapping tag for page dirty gets cleared. */ + if (nr == 0) { + set_page_writeback(page); + end_page_writeback(page); + } + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + cur, cur_len, !ret); + mapping_set_error(page->mapping, ret); + } + btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + if (ret < 0) + found_error = true; +next_page: + put_page(page); + cur = cur_end + 1; + } + + submit_write_bio(&bio_ctrl, found_error ? ret : 0); +} + +int extent_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int ret = 0; + struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + }; + + /* + * Allow only a single thread to do the reloc work in zoned mode to + * protect the write pointer updates. 
+ */ + btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); + ret = extent_write_cache_pages(mapping, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); + return ret; +} + +void extent_readahead(struct readahead_control *rac) +{ + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; + struct page *pagepool[16]; + struct extent_map *em_cached = NULL; + u64 prev_em_start = (u64)-1; + int nr; + + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = readahead_pos(rac); + u64 contig_end = contig_start + readahead_batch_length(rac) - 1; + + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio_ctrl, &prev_em_start); + } + + if (em_cached) + free_extent_map(em_cached); + submit_one_bio(&bio_ctrl); +} + +/* + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state + * records from the tree + */ +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) +{ + struct extent_state *cached_state = NULL; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; + + /* This function is only called for the btree inode */ + ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); + + start += ALIGN(offset, blocksize); + if (start > end) + return 0; + + lock_extent(tree, start, end, &cached_state); + folio_wait_writeback(folio); + + /* + * Currently for btree io tree, only EXTENT_LOCKED is utilized, + * so here we only need to unlock the extent range to free any + * existing extent state. + */ + unlock_extent(tree, start, end, &cached_state); + return 0; +} + +/* + * a helper for release_folio, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +static int try_release_extent_state(struct extent_io_tree *tree, + struct page *page, gfp_t mask) +{ + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { + ret = 0; + } else { + u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | + EXTENT_QGROUP_RESERVED); + + /* + * At this point we can safely clear everything except the + * locked bit, the nodatasum bit and the delalloc new bit. + * The delalloc new bit will be cleared by ordered extent + * completion. + */ + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; + } + return ret; +} + +/* + * a helper for release_folio. 
As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct page *page, gfp_t mask) +{ + struct extent_map *em; + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &btrfs_inode->io_tree; + struct extent_map_tree *map = &btrfs_inode->extent_tree; + + if (gfpflags_allow_blocking(mask) && + page->mapping->host->i_size > SZ_16M) { + u64 len; + while (start <= end) { + struct btrfs_fs_info *fs_info; + u64 cur_gen; + + len = end - start + 1; + write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em) { + write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED, 0, NULL)) + goto next; + /* + * If it's not in the list of modified extents, used + * by a fast fsync, we can remove it. If it's being + * logged we can safely remove it since fsync took an + * extra reference on the em. + */ + if (list_empty(&em->list) || + test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + goto remove_em; + /* + * If it's in the list of modified extents, remove it + * only if its generation is older then the current one, + * in which case we don't need it for a fast fsync. + * Otherwise don't remove it, we could be racing with an + * ongoing fast fsync that could miss the new extent. + */ + fs_info = btrfs_inode->root->fs_info; + spin_lock(&fs_info->trans_lock); + cur_gen = fs_info->generation; + spin_unlock(&fs_info->trans_lock); + if (em->generation >= cur_gen) + goto next; +remove_em: + /* + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there + * is no need to set the full fsync flag on the inode (it + * hurts the fsync performance for workloads with a data + * size that exceeds or is close to the system's memory). + */ + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); +next: + start = extent_map_end(em); + write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ + } + } + return try_release_extent_state(tree, page, mask); +} + +/* + * To cache previous fiemap extent + * + * Will be used for merging fiemap extent + */ +struct fiemap_cache { + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + int ret = 0; + + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + + if (!cache->cached) + goto assign; + + /* + * Sanity check, extent_fiemap() should have ensured that new + * fiemap extent won't overlap with cached one. + * Not recoverable. 
+ * + * NOTE: Physical address can overlap, due to compression + */ + if (cache->offset + cache->len > offset) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + cache->flags == flags) { + cache->len += len; + return 0; + } + + /* Not mergeable, need to submit cached one */ + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret) + return ret; +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; + + return 0; +} + +/* + * Emit last fiemap cache + * + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. + * So we must emit it before ending extent_fiemap(). + */ +static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) +{ + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; + + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + return ret; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + + /* See the comment at fiemap_search_slot() about why we clone. */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. 
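The merge rule spelled out above reduces to a single predicate: the new extent must begin exactly where the cached one ends, both logically and physically, and carry identical flags. The helper below is an illustrative restatement, not code from this patch; it reuses the struct fiemap_cache layout defined earlier in this file and assumes the usual kernel u64/u32/bool types are in scope.

    static bool fiemap_cache_mergeable(const struct fiemap_cache *cache,
                                       u64 offset, u64 phys, u32 flags)
    {
            /* Nothing cached yet, so there is nothing to merge with. */
            if (!cache->cached)
                    return false;
            /* Logically contiguous: new extent starts where the cached one ends. */
            if (cache->offset + cache->len != offset)
                    return false;
            /* Physically contiguous: truly compressed extents fail this check. */
            if (cache->phys + cache->len != phys)
                    return false;
            /* Flags must match exactly (SHARED, ENCODED, DELALLOC, ...). */
            return cache->flags == flags;
    }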
+ */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + } + + /* + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we call fiemap_fill_next_extent(), because that may cause a page + * fault when filling the user space buffer with fiemap data. + */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, + struct btrfs_backref_share_check_ctx *backref_ctx, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; + + /* + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. + */ + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; + + /* + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. 
+ */ + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); + } + + /* + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. + */ + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + } + + return 0; +} + +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). 
+ */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } + + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. */ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; + struct btrfs_path *path; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_share_check_ctx *backref_ctx; + u64 last_extent_end; + u64 prev_extent_end; + u64 lockstart; + u64 lockend; + bool stopped = false; + int ret; + + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + path = btrfs_alloc_path(); + if (!backref_ctx || !path) { + ret = -ENOMEM; + goto out; + } + + lockstart = round_down(start, inode->root->fs_info->sectorsize); + lockend = round_up(start + len, inode->root->fs_info->sectorsize); + prev_extent_end = lockstart; + + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, lockstart); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. + */ + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < lockend) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); + + /* + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. + */ + if (extent_end <= lockstart) + goto next_item; + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* We have in implicit hole (NO_HOLES feature enabled). 
*/ + if (prev_extent_end < key.offset) { + const u64 range_end = min(key.offset, lockend) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + prev_extent_end, range_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= lockend) { + stopped = true; + break; + } + } + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) + flags |= FIEMAP_EXTENT_ENCODED; + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, + disk_bytenr, extent_offset, + extent_gen, key.offset, + extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); + } + + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; + } + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; + } + cond_resched(); + } + +check_eof_delalloc: + /* + * Release (and free) the path before emitting any final entries to + * fiemap_fill_next_extent() to keep lockdep happy. This is because + * once we find no more file extent items exist, we may have a + * non-cloned leaf, and fiemap_fill_next_extent() can trigger page + * faults when copying data to the user space buffer. 
+ */ + btrfs_free_path(path); + path = NULL; + + if (!stopped && prev_extent_end < lockend) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, + 0, 0, 0, prev_extent_end, lockend - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = lockend; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; + } + } + + ret = emit_last_fiemap_cache(fieinfo, &cache); + +out_unlock: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); +out: + free_extent_state(delalloc_cached_state); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); + return ret; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ + kmem_cache_free(extent_buffer_cache, eb); +} + +static int extent_buffer_under_io(const struct extent_buffer *eb) +{ + return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} + +static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) +{ + struct btrfs_subpage *subpage; + + lockdep_assert_held(&page->mapping->private_lock); + + if (PagePrivate(page)) { + subpage = (struct btrfs_subpage *)page->private; + if (atomic_read(&subpage->eb_refs)) + return true; + /* + * Even there is no eb refs here, we may still have + * end_page_read() call relying on page::private. + */ + if (atomic_read(&subpage->readers)) + return true; + } + return false; +} + +static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + /* + * For mapped eb, we're going to change the page private, which should + * be done under the private_lock. + */ + if (mapped) + spin_lock(&page->mapping->private_lock); + + if (!PagePrivate(page)) { + if (mapped) + spin_unlock(&page->mapping->private_lock); + return; + } + + if (fs_info->nodesize >= PAGE_SIZE) { + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + detach_page_private(page); + } + if (mapped) + spin_unlock(&page->mapping->private_lock); + return; + } + + /* + * For subpage, we can have dummy eb with page private. In this case, + * we can directly detach the private as such page is only attached to + * one dummy eb, no sharing. + */ + if (!mapped) { + btrfs_detach_subpage(fs_info, page); + return; + } + + btrfs_page_dec_eb_refs(fs_info, page); + + /* + * We can only detach the page private if there are no other ebs in the + * page range and no unfinished IO. 
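For context on how the extents emitted by extent_fiemap() and emit_fiemap_extent() above reach callers, here is a sketch of a userspace consumer of the generic FIEMAP interface (linux/fiemap.h, FS_IOC_FIEMAP). It is illustrative only and not part of this patch; the fixed 64-extent buffer and the FIEMAP_FLAG_SYNC flag are arbitrary choices for the example.

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    static int dump_fiemap(const char *path)
    {
            const unsigned int max_extents = 64;
            struct fiemap *fm;
            unsigned int i;
            int fd, ret = -1;

            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;
            fm = calloc(1, sizeof(*fm) +
                           max_extents * sizeof(struct fiemap_extent));
            if (!fm)
                    goto out_close;

            fm->fm_start = 0;
            fm->fm_length = ~0ULL;            /* map the whole file */
            fm->fm_flags = FIEMAP_FLAG_SYNC;  /* flush delalloc first */
            fm->fm_extent_count = max_extents;

            if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
                    goto out_free;

            for (i = 0; i < fm->fm_mapped_extents; i++) {
                    struct fiemap_extent *fe = &fm->fm_extents[i];

                    /* FIEMAP_EXTENT_LAST in fe_flags marks the file's final extent. */
                    printf("logical %llu physical %llu length %llu flags 0x%x\n",
                           (unsigned long long)fe->fe_logical,
                           (unsigned long long)fe->fe_physical,
                           (unsigned long long)fe->fe_length,
                           fe->fe_flags);
            }
            ret = 0;
    out_free:
            free(fm);
    out_close:
            close(fd);
            return ret;
    }

A real consumer would loop, advancing fm_start past the last returned extent, until an extent carrying FIEMAP_EXTENT_LAST is seen; the single ioctl above only covers files with at most 64 mapped extents.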
+ */ + if (!page_range_has_eb(fs_info, page)) + btrfs_detach_subpage(fs_info, page); + + spin_unlock(&page->mapping->private_lock); +} + +/* Release all pages attached to the extent buffer */ +static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +{ + int i; + int num_pages; + + ASSERT(!extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) + continue; + + detach_extent_buffer_page(eb, page); + + /* One for when we allocated the page */ + put_page(page); + } +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_pages(eb); + btrfs_leak_debug_del_eb(eb); + __free_extent_buffer(eb); +} + +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len) +{ + struct extent_buffer *eb = NULL; + + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); + eb->start = start; + eb->len = len; + eb->fs_info = fs_info; + init_rwsem(&eb->lock); + + btrfs_leak_debug_add_eb(eb); + + spin_lock_init(&eb->refs_lock); + atomic_set(&eb->refs, 1); + + ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + + return eb; +} + +struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) +{ + int i; + struct extent_buffer *new; + int num_pages = num_extent_pages(src); + int ret; + + new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + if (new == NULL) + return NULL; + + /* + * Set UNMAPPED before calling btrfs_release_extent_buffer(), as + * btrfs_release_extent_buffer() have different behavior for + * UNMAPPED subpage extent buffer. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + + for (i = 0; i < num_pages; i++) { + int ret; + struct page *p = new->pages[i]; + + ret = attach_extent_buffer_page(new, p, NULL); + if (ret < 0) { + btrfs_release_extent_buffer(new); + return NULL; + } + WARN_ON(PageDirty(p)); + } + copy_extent_buffer_full(new, src); + set_extent_buffer_uptodate(new); + + return new; +} + +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb; + int num_pages; + int i; + int ret; + + eb = __alloc_extent_buffer(fs_info, start, len); + if (!eb) + return NULL; + + num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = attach_extent_buffer_page(eb, p, NULL); + if (ret < 0) + goto err; + } + + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + return eb; +err: + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } + } + __free_extent_buffer(eb); + return NULL; +} + +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); +} + +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + int refs; + /* + * The TREE_REF bit is first set when the extent_buffer is added + * to the radix tree. It is also reset, if unset, when a new reference + * is created by find_extent_buffer. 
+ * + * It is only cleared in two cases: freeing the last non-tree + * reference to the extent_buffer when its STALE bit is set or + * calling release_folio when the tree reference is the only reference. + * + * In both cases, care is taken to ensure that the extent_buffer's + * pages are not under io. However, release_folio can be concurrently + * called with creating new references, which is prone to race + * conditions between the calls to check_buffer_tree_ref in those + * codepaths and clearing TREE_REF in try_release_extent_buffer. + * + * The actual lifetime of the extent_buffer in the radix tree is + * adequately protected by the refcount, but the TREE_REF bit and + * its corresponding reference are not. To protect against this + * class of races, we call check_buffer_tree_ref from the codepaths + * which trigger io. Note that once io is initiated, TREE_REF can no + * longer be cleared, so that is the moment at which any such race is + * best fixed. + */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + return; + + spin_lock(&eb->refs_lock); + if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_inc(&eb->refs); + spin_unlock(&eb->refs_lock); +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) +{ + int num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + if (p != accessed) + mark_page_accessed(p); + } +} + +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb; + + eb = find_extent_buffer_nolock(fs_info, start); + if (!eb) + return NULL; + /* + * Lock our eb's refs_lock to avoid races with free_extent_buffer(). + * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and + * another task running free_extent_buffer() might have seen that flag + * set, eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process of + * decrementing the extent buffer's reference count twice. So here we + * could race and increment the eb's reference count, clear its stale + * flag, mark it as dirty and drop our reference before the other task + * finishes executing free_extent_buffer, which would later result in + * an attempt to free an extent buffer that is dirty. 
+ */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } + mark_extent_buffer_accessed(eb, NULL); + return eb; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(fs_info, start); + if (!eb) + return ERR_PTR(-ENOMEM); + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + +static struct extent_buffer *grab_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page) +{ + struct extent_buffer *exists; + + /* + * For subpage case, we completely rely on radix tree to ensure we + * don't try to insert two ebs for the same bytenr. So here we always + * return NULL and just continue. + */ + if (fs_info->nodesize < PAGE_SIZE) + return NULL; + + /* Page not yet attached to an extent buffer */ + if (!PagePrivate(page)) + return NULL; + + /* + * We could have already allocated an eb for this page and attached one + * so lets see if we can get a ref on the existing eb, and if we can we + * know it's good and we can just return that one, else we know we can + * just overwrite page->private. 
+ */ + exists = (struct extent_buffer *)page->private; + if (atomic_inc_not_zero(&exists->refs)) + return exists; + + WARN_ON(PageDirty(page)); + detach_page_private(page); + return NULL; +} + +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !PAGE_ALIGNED(start)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, u64 owner_root, int level) +{ + unsigned long len = fs_info->nodesize; + int num_pages; + int i; + unsigned long index = start >> PAGE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct btrfs_subpage *prealloc = NULL; + u64 lockdep_owner = owner_root; + int uptodate = 1; + int ret; + + if (check_eb_alignment(fs_info, start)) + return ERR_PTR(-EINVAL); + +#if BITS_PER_LONG == 32 + if (start >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, + "extent buffer %llu is beyond 32bit page cache limit", start); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } + if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) + btrfs_warn_32bit_limit(fs_info); +#endif + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + + eb = __alloc_extent_buffer(fs_info, start, len); + if (!eb) + return ERR_PTR(-ENOMEM); + + /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); + + num_pages = num_extent_pages(eb); + + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock nor page lock hold. + * + * The memory will be freed by attach_extent_buffer_page() or freed + * manually if we exit earlier. + */ + if (fs_info->nodesize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + exists = ERR_CAST(prealloc); + goto free_eb; + } + } + + for (i = 0; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); + if (!p) { + exists = ERR_PTR(-ENOMEM); + btrfs_free_subpage(prealloc); + goto free_eb; + } + + spin_lock(&mapping->private_lock); + exists = grab_extent_buffer(fs_info, p); + if (exists) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + put_page(p); + mark_extent_buffer_accessed(exists, p); + btrfs_free_subpage(prealloc); + goto free_eb; + } + /* Should not fail, as we have preallocated the memory */ + ret = attach_extent_buffer_page(eb, p, prealloc); + ASSERT(!ret); + /* + * To inform we have extra eb under allocation, so that + * detach_extent_buffer_page() won't release the page private + * when the eb hasn't yet been inserted into radix tree. + * + * The ref will be decreased when the eb released the page, in + * detach_extent_buffer_page(). 
+ * Thus needs no special handling in error path. + */ + btrfs_page_inc_eb_refs(fs_info, p); + spin_unlock(&mapping->private_lock); + + WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); + eb->pages[i] = p; + if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) + uptodate = 0; + + /* + * We can't unlock the pages just yet since the extent buffer + * hasn't been properly inserted in the radix tree, this + * opens a race with btree_release_folio which can free a page + * while we are still filling in all pages for the buffer and + * we could crash. + */ + } + if (uptodate) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + /* add one reference for the tree */ + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * Now it's safe to unlock the pages because any calls to + * btree_release_folio will correctly detect that a page belongs to a + * live buffer and won't free them prematurely. + */ + for (i = 0; i < num_pages; i++) + unlock_page(eb->pages[i]); + return eb; + +free_eb: + WARN_ON(!atomic_dec_and_test(&eb->refs)); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } + + btrfs_release_extent_buffer(eb); + return exists; +} + +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +static int release_extent_buffer(struct extent_buffer *eb) + __releases(&eb->refs_lock) +{ + lockdep_assert_held(&eb->refs_lock); + + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; + + spin_unlock(&eb->refs_lock); + + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> fs_info->sectorsize_bits); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); + } + + btrfs_leak_debug_del_eb(eb); + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_pages(eb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { + __free_extent_buffer(eb); + return 1; + } +#endif + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return 1; + } + spin_unlock(&eb->refs_lock); + + return 0; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + int refs; + if (!eb) + return; + + refs = atomic_read(&eb->refs); + while (1) { + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) + break; + if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) + return; + } + + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. 
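The refs loop in free_extent_buffer() above is a lock-free "decrement unless the count is already low" fast path: it breaks out to the refs_lock slow path at refs <= 3 for mapped buffers and refs == 1 for unmapped ones, and otherwise drops a reference with a compare-and-swap. A standalone sketch of that pattern using C11 atomics, purely illustrative and not part of the patch:

    #include <stdatomic.h>
    #include <stdbool.h>

    /*
     * Decrement *v unless it is already at or below 'floor'. Returns true if
     * the decrement happened (fast path), false if the caller must fall back
     * to a locked slow path, mirroring the loop in free_extent_buffer().
     */
    static bool dec_unless_le(atomic_int *v, int floor)
    {
            int cur = atomic_load(v);

            while (cur > floor) {
                    /* On failure, 'cur' is refreshed with the current value. */
                    if (atomic_compare_exchange_weak(v, &cur, cur - 1))
                            return true;
            }
            return false;
    }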
+ */ + release_extent_buffer(eb); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) + return; + + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb); +} + +static void btree_clear_page_dirty(struct page *page) +{ + ASSERT(PageDirty(page)); + ASSERT(PageLocked(page)); + clear_page_dirty_for_io(page); + xa_lock_irq(&page->mapping->i_pages); + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&page->mapping->i_pages); +} + +static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + bool last; + + /* btree_clear_page_dirty() needs page locked */ + lock_page(page); + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, + eb->len); + if (last) + btree_clear_page_dirty(page); + unlock_page(page); + WARN_ON(atomic_read(&eb->refs) == 0); +} + +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int i; + int num_pages; + struct page *page; + + btrfs_assert_tree_write_locked(eb); + + if (trans && btrfs_header_generation(eb) != trans->transid) + return; + + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + return; + + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, + fs_info->dirty_metadata_batch); + + if (eb->fs_info->nodesize < PAGE_SIZE) + return clear_subpage_extent_buffer_dirty(eb); + + num_pages = num_extent_pages(eb); + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if (!PageDirty(page)) + continue; + lock_page(page); + btree_clear_page_dirty(page); + unlock_page(page); + } + WARN_ON(atomic_read(&eb->refs) == 0); +} + +void set_extent_buffer_dirty(struct extent_buffer *eb) +{ + int i; + int num_pages; + bool was_dirty; + + check_buffer_tree_ref(eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + + num_pages = num_extent_pages(eb); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + + if (!was_dirty) { + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + + /* + * For subpage case, we can have other extent buffers in the + * same page, and in clear_subpage_extent_buffer_dirty() we + * have to clear page dirty without subpage lock held. + * This can cause race where our page gets dirty cleared after + * we just set it. + * + * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * its page for other reasons, we can use page lock to prevent + * the above race. 
+ */ + if (subpage) + lock_page(eb->pages[0]); + for (i = 0; i < num_pages; i++) + btrfs_page_set_dirty(eb->fs_info, eb->pages[i], + eb->start, eb->len); + if (subpage) + unlock_page(eb->pages[0]); + percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, + eb->len, + eb->fs_info->dirty_metadata_batch); + } +#ifdef CONFIG_BTRFS_DEBUG + for (i = 0; i < num_pages; i++) + ASSERT(PageDirty(eb->pages[i])); +#endif +} + +void clear_extent_buffer_uptodate(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page; + int num_pages; + int i; + + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); + } +} + +void set_extent_buffer_uptodate(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page; + int num_pages; + int i; + + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); + } +} + +static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +{ + struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; + bool uptodate = !bbio->bio.bi_status; + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; + + eb->read_mirror = bbio->mirror_num; + + if (uptodate && + btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) + uptodate = false; + + if (uptodate) { + set_extent_buffer_uptodate(eb); + } else { + clear_extent_buffer_uptodate(eb); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + } + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; + struct page *page = bvec->bv_page; + u32 len = bvec->bv_len; + + if (uptodate) + btrfs_page_set_uptodate(fs_info, page, start, len); + else + btrfs_page_clear_uptodate(fs_info, page, start, len); + + bio_offset += len; + } + + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + free_extent_buffer(eb); + + bio_put(&bbio->bio); +} + +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) +{ + int num_pages = num_extent_pages(eb), i; + struct btrfs_bio *bbio; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 0; + + /* + * We could have had EXTENT_BUFFER_UPTODATE cleared by the write + * operation, which could potentially still be in flight. In this case + * we simply want to return an error. + */ + if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) + return -EIO; + + /* Someone else is already reading the buffer, just wait for it. 
*/ + if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) + goto done; + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + check_buffer_tree_ref(eb); + atomic_inc(&eb->refs); + + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_READ | REQ_META, eb->fs_info, + extent_buffer_read_end_io, eb); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + memcpy(&bbio->parent_check, check, sizeof(*check)); + if (eb->fs_info->nodesize < PAGE_SIZE) { + __bio_add_page(&bbio->bio, eb->pages[0], eb->len, + eb->start - page_offset(eb->pages[0])); + } else { + for (i = 0; i < num_pages; i++) + __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); + } + btrfs_submit_bio(bbio, mirror_num); + +done: + if (wait == WAIT_COMPLETE) { + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return -EIO; + } + + return 0; +} + +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. + */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + +void read_extent_buffer(const struct extent_buffer *eb, void *dstv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + unsigned long i = get_eb_page_index(start); + + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. 
+ */ + memset(dstv, 0, len); + return; + } + + offset = get_eb_offset_in_page(eb, start); + + while (len > 0) { + page = eb->pages[i]; + + cur = min(len, (PAGE_SIZE - offset)); + kaddr = page_address(page); + memcpy(dst, kaddr + offset, cur); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + unsigned long i = get_eb_page_index(start); + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = get_eb_offset_in_page(eb, start); + + while (len > 0) { + page = eb->pages[i]; + + cur = min(len, (PAGE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user_nofault(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + +int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + unsigned long i = get_eb_page_index(start); + int ret = 0; + + if (check_eb_range(eb, start, len)) + return -EINVAL; + + offset = get_eb_offset_in_page(eb, start); + + while (len > 0) { + page = eb->pages[i]; + + cur = min(len, (PAGE_SIZE - offset)); + + kaddr = page_address(page); + ret = memcmp(ptr, kaddr + offset, cur); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +/* + * Check that the extent buffer is uptodate. + * + * For regular sector size == PAGE_SIZE case, check if @page is uptodate. + * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. + */ +static void assert_eb_page_uptodate(const struct extent_buffer *eb, + struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. + */ + if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; + + if (fs_info->nodesize < PAGE_SIZE) { + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len))) + btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); + } else { + WARN_ON(!PageUptodate(page)); + } +} + +static void __write_extent_buffer(const struct extent_buffer *eb, + const void *srcv, unsigned long start, + unsigned long len, bool use_memmove) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + unsigned long i = get_eb_page_index(start); + /* For unmapped (dummy) ebs, no need to check their uptodate status. 
*/ + const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); + + if (check_eb_range(eb, start, len)) + return; + + offset = get_eb_offset_in_page(eb, start); + + while (len > 0) { + page = eb->pages[i]; + if (check_uptodate) + assert_eb_page_uptodate(eb, page); + + cur = min(len, PAGE_SIZE - offset); + kaddr = page_address(page); + if (use_memmove) + memmove(kaddr + offset, src, cur); + else + memcpy(kaddr + offset, src, cur); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + return __write_extent_buffer(eb, srcv, start, len, false); +} + +static void memset_extent_buffer(const struct extent_buffer *eb, int c, + unsigned long start, unsigned long len) +{ + unsigned long cur = start; + + while (cur < start + len) { + unsigned long index = get_eb_page_index(cur); + unsigned int offset = get_eb_offset_in_page(eb, cur); + unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); + struct page *page = eb->pages[index]; + + assert_eb_page_uptodate(eb, page); + memset(page_address(page) + offset, c, cur_len); + + cur += cur_len; + } +} + +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + if (check_eb_range(eb, start, len)) + return; + return memset_extent_buffer(eb, 0, start, len); +} + +void copy_extent_buffer_full(const struct extent_buffer *dst, + const struct extent_buffer *src) +{ + unsigned long cur = 0; + + ASSERT(dst->len == src->len); + + while (cur < src->len) { + unsigned long index = get_eb_page_index(cur); + unsigned long offset = get_eb_offset_in_page(src, cur); + unsigned long cur_len = min(src->len, PAGE_SIZE - offset); + void *addr = page_address(src->pages[index]) + offset; + + write_extent_buffer(dst, addr, cur, cur_len); + + cur += cur_len; + } +} + +void copy_extent_buffer(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + unsigned long i = get_eb_page_index(dst_offset); + + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + + WARN_ON(src->len != dst_len); + + offset = get_eb_offset_in_page(dst, dst_offset); + + while (len > 0) { + page = dst->pages[i]; + assert_eb_page_uptodate(dst, page); + + cur = min(len, (unsigned long)(PAGE_SIZE - offset)); + + kaddr = page_address(page); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +/* + * eb_bitmap_offset() - calculate the page and offset of the byte containing the + * given bit number + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains the + * given bit number + * @page_offset: return offset into the page given by page_index + * + * This helper hides the ugliness of finding the byte in an extent buffer which + * contains a given bit. 
+ */ +static inline void eb_bitmap_offset(const struct extent_buffer *eb, + unsigned long start, unsigned long nr, + unsigned long *page_index, + size_t *page_offset) +{ + size_t byte_offset = BIT_BYTE(nr); + size_t offset; + + /* + * The byte we want is the offset of the extent buffer + the offset of + * the bitmap item in the extent buffer + the offset of the byte in the + * bitmap item. + */ + offset = start + offset_in_page(eb->start) + byte_offset; + + *page_index = offset >> PAGE_SHIFT; + *page_offset = offset_in_page(offset); +} + +/* + * Determine whether a bit in a bitmap item is set. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test + */ +int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long nr) +{ + u8 *kaddr; + struct page *page; + unsigned long i; + size_t offset; + + eb_bitmap_offset(eb, start, nr, &i, &offset); + page = eb->pages[i]; + assert_eb_page_uptodate(eb, page); + kaddr = page_address(page); + return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); +} + +static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) +{ + unsigned long index = get_eb_page_index(bytenr); + + if (check_eb_range(eb, bytenr, 1)) + return NULL; + return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); +} + +/* + * Set an area of a bitmap to 1. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set + */ +void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); + u8 *kaddr; + + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); + + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr |= mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); +} + + +/* + * Clear an area of a bitmap. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear + */ +void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len) +{ + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); + u8 *kaddr; + + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); + + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr &= ~mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. 
*/ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); +} + +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) +{ + unsigned long distance = (src > dst) ? src - dst : dst - src; + return distance < len; +} + +void memcpy_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + unsigned long cur_off = 0; + + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; + + while (cur_off < len) { + unsigned long cur_src = cur_off + src_offset; + unsigned long pg_index = get_eb_page_index(cur_src); + unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long cur_len = min(src_offset + len - cur_src, + PAGE_SIZE - pg_off); + void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + const bool use_memmove = areas_overlap(src_offset + cur_off, + dst_offset + cur_off, cur_len); + + __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, + use_memmove); + cur_off += cur_len; + } +} + +void memmove_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; + + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + + while (len > 0) { + unsigned long src_i; + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + void *src_addr; + bool use_memmove; + + src_i = get_eb_page_index(src_end); + + dst_off_in_page = get_eb_offset_in_page(dst, dst_end); + src_off_in_page = get_eb_offset_in_page(dst, src_end); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + + src_addr = page_address(dst->pages[src_i]) + src_off_in_page - + cur + 1; + use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, + cur); + + __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, + use_memmove); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +#define GANG_LOOKUP_SIZE 16 +static struct extent_buffer *get_next_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; + struct extent_buffer *found = NULL; + u64 page_start = page_offset(page); + u64 cur = page_start; + + ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + lockdep_assert_held(&fs_info->buffer_lock); + + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } + } + cur = gang[ret - 1]->start + gang[ret - 1]->len; + } +out: + return found; +} + +static int try_release_subpage_extent_buffer(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + u64 cur = page_offset(page); + const u64 end = page_offset(page) + PAGE_SIZE; + int ret; + + while (cur < end) { + struct extent_buffer *eb = NULL; + + /* + * Unlike try_release_extent_buffer() which 
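memcpy_extent_buffer() above only falls back to memmove() when areas_overlap() reports that the source and destination ranges intersect. A standalone check of that distance-based test, with made-up offsets:

#include <stdbool.h>
#include <stdio.h>

/* Same test as areas_overlap() in extent_io.c: two ranges of length len
 * starting at src and dst intersect iff their starts are closer than len. */
static bool areas_overlap(unsigned long src, unsigned long dst,
                          unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;

        return distance < len;
}

int main(void)
{
        printf("%d\n", areas_overlap(0, 100, 50)); /* 0: disjoint, memcpy is fine */
        printf("%d\n", areas_overlap(0, 30, 50));  /* 1: overlap, memmove needed */
        printf("%d\n", areas_overlap(30, 0, 50));  /* 1: order does not matter */
        return 0;
}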
uses page->private + * to grab buffer, for subpage case we rely on radix tree, thus + * we need to ensure radix tree consistency. + * + * We also want an atomic snapshot of the radix tree, thus go + * with spinlock rather than RCU. + */ + spin_lock(&fs_info->buffer_lock); + eb = get_next_extent_buffer(fs_info, page, cur); + if (!eb) { + /* No more eb in the page range after or at cur */ + spin_unlock(&fs_info->buffer_lock); + break; + } + cur = eb->start + eb->len; + + /* + * The same as try_release_extent_buffer(), to ensure the eb + * won't disappear out from under us. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&fs_info->buffer_lock); + break; + } + spin_unlock(&fs_info->buffer_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a + * real ref, so just return, this eb will likely be freed soon + * anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + break; + } + + /* + * Here we don't care about the return value, we will always + * check the page private at the end. And + * release_extent_buffer() will release the refs_lock. + */ + release_extent_buffer(eb); + } + /* + * Finally to check if we have cleared page private, as if we have + * released all ebs in the page, the page private should be cleared now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) + ret = 1; + else + ret = 0; + spin_unlock(&page->mapping->private_lock); + return ret; + +} + +int try_release_extent_buffer(struct page *page) +{ + struct extent_buffer *eb; + + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + return try_release_subpage_extent_buffer(page); + + /* + * We need to make sure nobody is changing page->private, as we rely on + * page->private as the pointer to extent buffer. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; + } + + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + + /* + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; + } + spin_unlock(&page->mapping->private_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + + return release_extent_buffer(eb); +} + +/* + * btrfs_readahead_tree_block - attempt to readahead a child block + * @fs_info: the fs_info + * @bytenr: bytenr to read + * @owner_root: objectid of the root that owns this eb + * @gen: generation for the uptodate check, can be 0 + * @level: level for the eb + * + * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a + * normal uptodate check of the eb, without checking the generation. If we have + * to read the block we will not block on anything. 
+ */ +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, u64 gen, int level) +{ + struct btrfs_tree_parent_check check = { + .has_first_key = 0, + .level = level, + .transid = gen + }; + struct extent_buffer *eb; + int ret; + + eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + if (IS_ERR(eb)) + return; + + if (btrfs_buffer_uptodate(eb, gen, 1)) { + free_extent_buffer(eb); + return; + } + + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); + if (ret < 0) + free_extent_buffer_stale(eb); + else + free_extent_buffer(eb); +} + +/* + * btrfs_readahead_node_child - readahead a node's child block + * @node: parent node we're reading from + * @slot: slot in the parent node for the child we want to read + * + * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at + * the slot in the node provided. + */ +void btrfs_readahead_node_child(struct extent_buffer *node, int slot) +{ + btrfs_readahead_tree_block(node->fs_info, + btrfs_node_blockptr(node, slot), + btrfs_header_owner(node), + btrfs_node_ptr_generation(node, slot), + btrfs_header_level(node) - 1); +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 0000000000..68368ba993 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_IO_H +#define BTRFS_EXTENT_IO_H + +#include +#include +#include +#include +#include "compression.h" +#include "ulist.h" +#include "misc.h" + +struct btrfs_trans_handle; + +enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, + EXTENT_BUFFER_CORRUPT, + /* this got triggered by readahead */ + EXTENT_BUFFER_READAHEAD, + EXTENT_BUFFER_TREE_REF, + EXTENT_BUFFER_STALE, + EXTENT_BUFFER_WRITEBACK, + /* read IO error */ + EXTENT_BUFFER_READ_ERR, + EXTENT_BUFFER_UNMAPPED, + EXTENT_BUFFER_IN_TREE, + /* write IO error */ + EXTENT_BUFFER_WRITE_ERR, + EXTENT_BUFFER_NO_CHECK, + /* Indicate that extent buffer pages a being read */ + EXTENT_BUFFER_READING, +}; + +/* these are flags for __process_pages_contig */ +enum { + ENUM_BIT(PAGE_UNLOCK), + /* Page starts writeback, clear dirty bit and set writeback bit */ + ENUM_BIT(PAGE_START_WRITEBACK), + ENUM_BIT(PAGE_END_WRITEBACK), + ENUM_BIT(PAGE_SET_ORDERED), +}; + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 + +/* + * The extent buffer bitmap operations are done with byte granularity instead of + * word granularity for two reasons: + * 1. The bitmaps must be little-endian on disk. + * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. 
+ */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +struct btrfs_root; +struct btrfs_inode; +struct btrfs_fs_info; +struct extent_io_tree; +struct btrfs_tree_parent_check; + +int __init extent_buffer_init_cachep(void); +void __cold extent_buffer_free_cachep(void); + +#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) +struct extent_buffer { + u64 start; + unsigned long len; + unsigned long bflags; + struct btrfs_fs_info *fs_info; + spinlock_t refs_lock; + atomic_t refs; + int read_mirror; + struct rcu_head rcu_head; + pid_t lock_owner; + /* >= 0 if eb belongs to a log tree, -1 otherwise */ + s8 log_index; + + struct rw_semaphore lock; + + struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +struct btrfs_eb_write_context { + struct writeback_control *wbc; + struct extent_buffer *eb; + /* Block group @eb resides in. Only used for zoned mode. */ + struct btrfs_block_group *zoned_bg; +}; + +/* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. 
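The byte-granularity masks defined above are what extent_buffer_bitmap_set()/_clear() combine for the first and last partial bytes of a run. A small standalone program exercising the same macros; the expected values in the comments were worked out by hand:

#include <stdio.h>

#define BITS_PER_BYTE 8
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
        ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
        (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))

int main(void)
{
        /* Set 10 bits starting at bit 3, i.e. bits [3, 13). */
        unsigned long pos = 3, len = 10;
        unsigned int first_mask = BITMAP_FIRST_BYTE_MASK(pos);
        unsigned int last_mask = BITMAP_LAST_BYTE_MASK(pos + len);

        printf("first byte %lu mask 0x%02x\n", BIT_BYTE(pos), first_mask);
        /* -> byte 0, 0xf8: bits 3..7 */
        printf("last byte %lu mask 0x%02x\n", BIT_BYTE(pos + len - 1), last_mask);
        /* -> byte 1, 0x1f: bits 8..12 */
        return 0;
}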
+ */ + return offset >> PAGE_SHIFT; +} + +/* + * Structure to record how many bytes and which ranges are set/cleared + */ +struct extent_changeset { + /* How many bytes are set/cleared in this operation */ + u64 bytes_changed; + + /* Changed ranges */ + struct ulist range_changed; +}; + +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + +struct extent_map_tree; + +int try_release_extent_mapping(struct page *page, gfp_t mask); +int try_release_extent_buffer(struct page *page); + +int btrfs_read_folio(struct file *file, struct folio *folio); +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty); +int extent_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); +void extent_readahead(struct readahead_control *rac); +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +int set_page_extent_mapped(struct page *page); +void clear_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, u64 owner_root, int level); +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); +struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); +void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *parent_check); +void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, u64 gen, int level); +void btrfs_readahead_node_child(struct extent_buffer *node, int slot); + +static inline int num_extent_pages(const struct extent_buffer *eb) +{ + /* + * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to + * sectorsize, it's just eb->len >> PAGE_SHIFT. + * + * For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE, + * thus have to ensure we get at least one page. 
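get_eb_offset_in_page() above adds eb->start before taking the in-page offset because, with nodesize < PAGE_SIZE, several tree blocks share one page. A quick numeric illustration, assuming the 64 KiB page / 16 KiB node combination the comments describe; the addresses are invented:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 0x10000UL        /* 64 KiB page */

static unsigned long offset_in_page(uint64_t off)
{
        return off & (DEMO_PAGE_SIZE - 1);
}

int main(void)
{
        /* A 16 KiB tree block that starts 32 KiB into its 64 KiB page. */
        uint64_t eb_start = 0x1000000 + 0x8000;
        unsigned long offset = 0x100;   /* offset inside the extent buffer */

        /* Page-sized nodes: eb_start is page aligned, so the add is a no-op.
         * Subpage nodes: the add skips the tree blocks that precede this eb
         * in the same page. */
        printf("in-page offset = 0x%lx\n", offset_in_page(offset + eb_start));
        /* -> 0x8100 */
        return 0;
}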
+ */ + return (eb->len >> PAGE_SHIFT) ?: 1; +} + +static inline int extent_buffer_uptodate(const struct extent_buffer *eb) +{ + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +} + +int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, + unsigned long start, unsigned long len); +void read_extent_buffer(const struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dst, unsigned long start, + unsigned long len); +void write_extent_buffer(const struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); + +static inline void write_extent_buffer_chunk_tree_uuid( + const struct extent_buffer *eb, const void *chunk_tree_uuid) +{ + write_extent_buffer(eb, chunk_tree_uuid, + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_FSID_SIZE); +} + +static inline void write_extent_buffer_fsid(const struct extent_buffer *eb, + const void *fsid) +{ + write_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); +} + +void copy_extent_buffer_full(const struct extent_buffer *dst, + const struct extent_buffer *src); +void copy_extent_buffer(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memmove_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, + unsigned long len); +int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long pos); +void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); +void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len); +void set_extent_buffer_dirty(struct extent_buffer *eb); +void set_extent_buffer_uptodate(struct extent_buffer *eb); +void clear_extent_buffer_uptodate(struct extent_buffer *eb); +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + struct page *locked_page, + u32 bits_to_clear, unsigned long page_ops); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); + +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +bool find_lock_delalloc_range(struct inode *inode, + struct page *locked_page, u64 *start, + u64 *end); +#endif +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); + +#ifdef CONFIG_BTRFS_DEBUG +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info); +#else +#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0) +#endif + +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 0000000000..a6d8368ed0 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,1053 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "volumes.h" 
+#include "extent_map.h" +#include "compression.h" +#include "btrfs_inode.h" + + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("btrfs_extent_map", + sizeof(struct extent_map), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void __cold extent_map_exit(void) +{ + kmem_cache_destroy(extent_map_cache); +} + +/* + * Initialize the extent tree @tree. Should be called for each new inode or + * other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree) +{ + tree->map = RB_ROOT_CACHED; + INIT_LIST_HEAD(&tree->modified_extents); + rwlock_init(&tree->lock); +} + +/* + * Allocate a new extent_map structure. The new structure is returned with a + * reference count of one and needs to be freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(void) +{ + struct extent_map *em; + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); + if (!em) + return NULL; + RB_CLEAR_NODE(&em->rb_node); + em->compress_type = BTRFS_COMPRESS_NONE; + refcount_set(&em->refs, 1); + INIT_LIST_HEAD(&em->list); + return em; +} + +/* + * Drop the reference out on @em by one and free the structure if the reference + * count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + if (refcount_dec_and_test(&em->refs)) { + WARN_ON(extent_map_in_tree(em)); + WARN_ON(!list_empty(&em->list)); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + kfree(em->map_lookup); + kmem_cache_free(extent_map_cache, em); + } +} + +/* Do the math around the end of an extent, handling wrapping. */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root_cached *root, struct extent_map *em) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); + bool leftmost = true; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + if (em->start < entry->start) { + p = &(*p)->rb_left; + } else if (em->start >= extent_map_end(entry)) { + p = &(*p)->rb_right; + leftmost = false; + } else { + return -EEXIST; + } + } + + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color_cached(&em->rb_node, root, leftmost); + return 0; +} + +/* + * Search through the tree for an extent_map with a given offset. 
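range_end() above (and extent_map_end() in extent_map.h) clamp to (u64)-1 rather than letting start + len wrap around. A tiny standalone demonstration of why the wrap test is written as start + len < start:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

/* Same clamp as range_end()/extent_map_end(): if start + len wraps past
 * U64_MAX, treat the range as extending to the end of the address space. */
static uint64_t range_end(uint64_t start, uint64_t len)
{
        if (start + len < start)        /* unsigned wrap-around detected */
                return UINT64_MAX;
        return start + len;
}

int main(void)
{
        printf("%" PRIu64 "\n", range_end(4096, 8192));               /* 12288 */
        printf("0x%" PRIx64 "\n", range_end(UINT64_MAX - 10, 4096));  /* clamped */
        return 0;
}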
If it can't + * be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_or_next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + ASSERT(prev_or_next_ret); + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + + /* + * Previous extent map found, return as in this case the caller does not + * care about the next one. + */ + if (prev) { + *prev_or_next_ret = prev; + return NULL; + } + + prev = orig_prev; + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_or_next_ret = prev; + + return NULL; +} + +/* Check to see if two extent_map structs are adjacent and safe to merge. */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + + /* + * We don't want to merge stuff that hasn't been written to the log yet + * since it may not reflect exactly what is on disk, and that would be + * bad. + */ + if (!list_empty(&prev->list) || !list_empty(&next->list)) + return 0; + + ASSERT(next->block_start != EXTENT_MAP_DELALLOC && + prev->block_start != EXTENT_MAP_DELALLOC); + + if (prev->map_lookup || next->map_lookup) + ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && + test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->map_lookup == next->map_lookup && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) +{ + struct extent_map *merge = NULL; + struct rb_node *rb; + + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. 
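Stripped of the flag and list checks, the core of mergable_maps() is a geometry test: the previous map must end exactly where the next one starts, both logically and (for real extents) on disk. A reduced standalone version of just that condition, using invented struct and field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_em {
        uint64_t start, len;             /* logical file range */
        uint64_t block_start, block_len; /* on-disk range */
};

static uint64_t em_end(const struct demo_em *em)       { return em->start + em->len; }
static uint64_t em_block_end(const struct demo_em *em) { return em->block_start + em->block_len; }

/* Reduced form of the geometric test in mergable_maps(): logically and
 * physically contiguous, so the two maps describe one larger extent. */
static bool demo_mergable(const struct demo_em *prev, const struct demo_em *next)
{
        return em_end(prev) == next->start &&
               em_block_end(prev) == next->block_start;
}

int main(void)
{
        struct demo_em a = { .start = 0,    .len = 4096, .block_start = 1 << 20,          .block_len = 4096 };
        struct demo_em b = { .start = 4096, .len = 4096, .block_start = (1 << 20) + 4096, .block_len = 4096 };
        struct demo_em c = { .start = 8192, .len = 4096, .block_start = 5 << 20,          .block_len = 4096 };

        printf("%d %d\n", demo_mergable(&a, &b), demo_mergable(&b, &c)); /* 1 0 */
        return 0;
}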
+ */ + if (refcount_read(&em->refs) > 2) + return; + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->orig_start = merge->orig_start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); + + rb_erase_cached(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->block_len; + rb_erase_cached(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); + free_extent_map(merge); + } +} + +/* + * Unpin an extent from the cache. + * + * @tree: tree to unpin the extent in + * @start: logical offset in the file + * @len: length of the extent + * @gen: generation that this extent has been modified in + * + * Called after an extent has been written to disk properly. Set the generation + * to the generation that actually added the file item to the inode so we know + * we need to sync this extent when we call fsync(). + */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + u64 gen) +{ + int ret = 0; + struct extent_map *em; + bool prealloc = false; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + em->generation = gen; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; + em->mod_len = em->len; + + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { + prealloc = true; + clear_bit(EXTENT_FLAG_FILLING, &em->flags); + } + + try_merge_map(tree, em); + + if (prealloc) { + em->mod_start = em->start; + em->mod_len = em->len; + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + lockdep_assert_held_write(&tree->lock); + + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + if (extent_map_in_tree(em)) + try_merge_map(tree, em); +} + +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + refcount_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else + try_merge_map(tree, em); +} + +static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) +{ + struct map_lookup *map = em->map_lookup; + u64 stripe_size = em->orig_block_len; + int i; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); + } +} + +static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) +{ + struct map_lookup *map = em->map_lookup; + u64 stripe_size = em->orig_block_len; + int i; + + for (i = 0; i < map->num_stripes; i++) { 
+ struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); + } +} + +/* + * Add new extent map to the extent tree + * + * @tree: tree to insert new map in + * @em: map to insert + * @modified: indicate whether the given @em should be added to the + * modified list, which indicates the extent needs to be logged + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was successful. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified) +{ + int ret = 0; + + lockdep_assert_held_write(&tree->lock); + + ret = tree_insert(&tree->map, em); + if (ret) + goto out; + + setup_extent_mapping(tree, em, modified); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { + extent_map_device_set_bits(em, CHUNK_ALLOCATED); + extent_map_device_clear_bits(em, CHUNK_TRIMMED); + } +out: + return ret; +} + +static struct extent_map * +__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev_or_next = NULL; + u64 end = range_end(start, len); + + rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next); + if (!rb_node) { + if (prev_or_next) + rb_node = prev_or_next; + else + return NULL; + } + + em = rb_entry(rb_node, struct extent_map, rb_node); + + if (strict && !(end > em->start && start < extent_map_end(em))) + return NULL; + + refcount_inc(&em->refs); + return em; +} + +/* + * Lookup extent_map that intersects @start + @len range. + * + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + return __lookup_extent_mapping(tree, start, len, 1); +} + +/* + * Find a nearby extent map intersecting @start + @len (not an exact search). + * + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + return __lookup_extent_mapping(tree, start, len, 0); +} + +/* + * Remove an extent_map from the extent tree. + * + * @tree: extent tree to remove from + * @em: extent map being removed + * + * Remove @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use. 
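The strict mode of __lookup_extent_mapping() above uses the usual half-open interval intersection predicate, end > em->start && start < extent_map_end(em). A few standalone sanity checks of that predicate in isolation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True iff [start, end) intersects [em_start, em_end); end here plays the
 * role of start + len, i.e. it is exclusive, matching the strict lookup. */
static bool intersects(uint64_t start, uint64_t end,
                       uint64_t em_start, uint64_t em_end)
{
        return end > em_start && start < em_end;
}

int main(void)
{
        /* Extent map covering [4096, 8192). */
        printf("%d\n", intersects(0, 4096, 4096, 8192));    /* 0: only touches */
        printf("%d\n", intersects(0, 4097, 4096, 8192));    /* 1: one byte in */
        printf("%d\n", intersects(6000, 7000, 4096, 8192)); /* 1: contained */
        printf("%d\n", intersects(8192, 9000, 4096, 8192)); /* 0: starts at end */
        return 0;
}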
+ */ +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + lockdep_assert_held_write(&tree->lock); + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + rb_erase_cached(&em->rb_node, &tree->map); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_del_init(&em->list); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + extent_map_device_clear_bits(em, CHUNK_ALLOCATED); + RB_CLEAR_NODE(&em->rb_node); +} + +static void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + lockdep_assert_held_write(&tree->lock); + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} + +static struct extent_map *next_extent_map(const struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); +} + +/* + * Helper for btrfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, + * and an extent that you want to insert, deal with overlap and insert + * the best fitted new extent into the tree. + */ +static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start) +{ + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len = em->len; + } + return add_extent_mapping(em_tree, em, 0); +} + +/* + * Add extent mapping into em_tree. + * + * @fs_info: the filesystem + * @em_tree: extent tree into which we want to insert the extent mapping + * @em_in: extent we are inserting + * @start: start of the logical range btrfs_get_extent() is requesting + * @len: length of the logical range btrfs_get_extent() is requesting + * + * Note that @em_in's range may be different from [start, start+len), + * but they must be overlapped. + * + * Insert @em_in into @em_tree. In case there is an overlapping range, handle + * the -EEXIST by either: + * a) Returning the existing extent in @em_in if @start is within the + * existing em. + * b) Merge the existing extent with @em_in passed in. + * + * Return 0 on success, otherwise -EEXIST. 
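merge_extent_mapping() above shrinks the incoming extent map so it exactly fills the gap between its neighbours, shifting block_start by the same amount the logical start moved. The clamping arithmetic pulled out into a standalone sketch with hypothetical numbers:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        /* Hole between an existing map ending at 12288 and the next one
         * starting at 20480; the new map covers [8192, 24576). */
        uint64_t prev_end = 12288, next_start = 20480;
        uint64_t em_start = 8192, em_len = 16384, em_block_start = 1 << 20;

        uint64_t start = prev_end > em_start ? prev_end : em_start;
        uint64_t end = next_start < em_start + em_len ? next_start
                                                      : em_start + em_len;
        uint64_t start_diff = start - em_start;

        /* Same adjustment merge_extent_mapping() applies for uncompressed
         * extents: trim the logical range and shift the disk offset. */
        printf("start %" PRIu64 " len %" PRIu64 " block_start %" PRIu64 "\n",
               start, end - start, em_block_start + start_diff);
        /* -> start 12288 len 8192 block_start 1052672 */
        return 0;
}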
+ * + */ +int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, + struct extent_map **em_in, u64 start, u64 len) +{ + int ret; + struct extent_map *em = *em_in; + + /* + * Tree-checker should have rejected any inline extent with non-zero + * file offset. Here just do a sanity check. + */ + if (em->block_start == EXTENT_MAP_INLINE) + ASSERT(em->start == 0); + + ret = add_extent_mapping(em_tree, em, 0); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = search_extent_mapping(em_tree, start, len); + + trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); + + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. + */ + if (start >= existing->start && + start < extent_map_end(existing)) { + free_extent_map(em); + *em_in = existing; + ret = 0; + } else { + u64 orig_start = em->start; + u64 orig_len = em->len; + + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + ret = merge_extent_mapping(em_tree, existing, + em, start); + if (ret) { + free_extent_map(em); + *em_in = NULL; + WARN_ONCE(ret, +"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", + ret, existing->start, existing->len, + orig_start, orig_len); + } + free_extent_map(existing); + } + } + + ASSERT(ret == 0 || ret == -EEXIST); + return ret; +} + +/* + * Drop all extent maps from a tree in the fastest possible way, rescheduling + * if needed. This avoids searching the tree, from the root down to the first + * extent map, before each deletion. + */ +static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +{ + write_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { + struct extent_map *em; + struct rb_node *node; + + node = rb_first_cached(&tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(tree, em); + free_extent_map(em); + cond_resched_rwlock_write(&tree->lock); + } + write_unlock(&tree->lock); +} + +/* + * Drop all extent maps in a given range. + * + * @inode: The target inode. + * @start: Start offset of the range. + * @end: End offset of the range (inclusive value). + * @skip_pinned: Indicate if pinned extent maps should be ignored or not. + * + * This drops all the extent maps that intersect the given range [@start, @end]. + * Extent maps that partially overlap the range and extend behind or beyond it, + * are split. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, + bool skip_pinned) +{ + struct extent_map *split; + struct extent_map *split2; + struct extent_map *em; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 len = end - start + 1; + + WARN_ON(end < start); + if (end == (u64)-1) { + if (start == 0 && !skip_pinned) { + drop_all_extent_maps_fast(em_tree); + return; + } + len = (u64)-1; + } else { + /* Make end offset exclusive for use in the loop below. */ + end++; + } + + /* + * It's ok if we fail to allocate the extent maps, see the comment near + * the bottom of the loop below. 
We only need two spare extent maps in + * the worst case, where the first extent map that intersects our range + * starts before the range and the last extent map that intersects our + * range ends after our range (and they might be the same extent map), + * because we need to split those two extent maps at the boundaries. + */ + split = alloc_extent_map(); + split2 = alloc_extent_map(); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + + while (em) { + /* extent_map_end() returns exclusive value (last byte + 1). */ + const u64 em_end = extent_map_end(em); + struct extent_map *next_em = NULL; + u64 gen; + unsigned long flags; + bool modified; + bool compressed; + + if (em_end < end) { + next_em = next_extent_map(em); + if (next_em) { + if (next_em->start < end) + refcount_inc(&next_em->refs); + else + next_em = NULL; + } + } + + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + start = em_end; + goto next; + } + + flags = em->flags; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + /* + * In case we split the extent map, we want to preserve the + * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want + * it on the new extent maps. + */ + clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); + + /* + * The extent map does not cross our target range, so no need to + * split it, we can remove it directly. + */ + if (em->start >= start && em_end <= end) + goto remove_em; + + gen = em->generation; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + if (em->start < start) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = em->start; + split->len = start - em->start; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + + split->generation = gen; + split->flags = flags; + split->compress_type = em->compress_type; + replace_extent_mapping(em_tree, em, split, modified); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em_end > end) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = end; + split->len = em_end - end; + split->block_start = em->block_start; + split->flags = flags; + split->compress_type = em->compress_type; + split->generation = gen; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, + em->orig_block_len); + + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->orig_start = em->orig_start; + } else { + const u64 diff = start + len - em->start; + + split->block_len = split->len; + split->block_start += diff; + split->orig_start = em->orig_start; + } + } else { + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->orig_block_len = 0; + } + + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + int ret; + + ret = add_extent_mapping(em_tree, split, + modified); + /* Logic error, shouldn't happen. 
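When the drop range only partially covers an extent map, btrfs_drop_extent_map_range() keeps the surviving front and/or back pieces by splitting before it removes the middle. The interval arithmetic on its own, with the block/compression bookkeeping of the real code left out:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        /* Existing map [0, 16384) (end exclusive), drop range [4096, 12288). */
        uint64_t em_start = 0, em_end = 16384;
        uint64_t start = 4096, end = 12288;     /* already made exclusive */

        if (em_start < start)   /* front piece survives */
                printf("front: [%" PRIu64 ", %" PRIu64 ")\n",
                       em_start, em_start + (start - em_start));
        if (em_end > end)       /* back piece survives */
                printf("back:  [%" PRIu64 ", %" PRIu64 ")\n",
                       end, end + (em_end - end));
        /* -> front: [0, 4096)   back: [12288, 16384) */
        return 0;
}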
*/ + ASSERT(ret == 0); + if (WARN_ON(ret != 0) && modified) + btrfs_set_inode_full_sync(inode); + } + free_extent_map(split); + split = NULL; + } +remove_em: + if (extent_map_in_tree(em)) { + /* + * If the extent map is still in the tree it means that + * either of the following is true: + * + * 1) It fits entirely in our range (doesn't end beyond + * it or starts before it); + * + * 2) It starts before our range and/or ends after our + * range, and we were not able to allocate the extent + * maps for split operations, @split and @split2. + * + * If we are at case 2) then we just remove the entire + * extent map - this is fine since if anyone needs it to + * access the subranges outside our range, will just + * load it again from the subvolume tree's file extent + * item. However if the extent map was in the list of + * modified extents, then we must mark the inode for a + * full fsync, otherwise a fast fsync will miss this + * extent if it's new and needs to be logged. + */ + if ((em->start < start || em_end > end) && modified) { + ASSERT(!split); + btrfs_set_inode_full_sync(inode); + } + remove_extent_mapping(em_tree, em); + } + + /* + * Once for the tree reference (we replaced or removed the + * extent map from the tree). + */ + free_extent_map(em); +next: + /* Once for us (for our lookup reference). */ + free_extent_map(em); + + em = next_em; + } + + write_unlock(&em_tree->lock); + + free_extent_map(split); + free_extent_map(split2); +} + +/* + * Replace a range in the inode's extent map tree with a new extent map. + * + * @inode: The target inode. + * @new_em: The new extent map to add to the inode's extent map tree. + * @modified: Indicate if the new extent map should be added to the list of + * modified extents (for fast fsync tracking). + * + * Drops all the extent maps in the inode's extent map tree that intersect the + * range of the new extent map and adds the new extent map to the tree. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified) +{ + const u64 end = new_em->start + new_em->len - 1; + struct extent_map_tree *tree = &inode->extent_tree; + int ret; + + ASSERT(!extent_map_in_tree(new_em)); + + /* + * The caller has locked an appropriate file range in the inode's io + * tree, but getting -EEXIST when adding the new extent map can still + * happen in case there are extents that partially cover the range, and + * this is due to two tasks operating on different parts of the extent. + * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from + * btrfs_get_extent") for an example and details. + */ + do { + btrfs_drop_extent_map_range(inode, new_em->start, end, false); + write_lock(&tree->lock); + ret = add_extent_mapping(tree, new_em, modified); + write_unlock(&tree->lock); + } while (ret == -EEXIST); + + return ret; +} + +/* + * Split off the first pre bytes from the extent_map at [start, start + len], + * and set the block_start for it to new_logical. + * + * This function is used when an ordered_extent needs to be split. 
+ */ +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; + int ret = 0; + unsigned long flags; + + ASSERT(pre != 0); + ASSERT(pre < len); + + split_pre = alloc_extent_map(); + if (!split_pre) + return -ENOMEM; + split_mid = alloc_extent_map(); + if (!split_mid) { + ret = -ENOMEM; + goto out_free_pre; + } + + lock_extent(&inode->io_tree, start, start + len - 1, NULL); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + ret = -EIO; + goto out_unlock; + } + + ASSERT(em->len == len); + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); + + flags = em->flags; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; + split_pre->len = pre; + split_pre->orig_start = split_pre->start; + split_pre->block_start = new_logical; + split_pre->block_len = split_pre->len; + split_pre->orig_block_len = split_pre->block_len; + split_pre->ram_bytes = split_pre->len; + split_pre->flags = flags; + split_pre->compress_type = em->compress_type; + split_pre->generation = em->generation; + + replace_extent_mapping(em_tree, em, split_pre, 1); + + /* + * Now we only have an extent_map at: + * [em->start, em->start + pre] + */ + + /* Insert the middle extent_map. */ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); + + /* Once for us */ + free_extent_map(em); + /* Once for the tree */ + free_extent_map(em); + +out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + free_extent_map(split_mid); +out_free_pre: + free_extent_map(split_pre); + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 0000000000..35d27c756e --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_MAP_H +#define BTRFS_EXTENT_MAP_H + +#include +#include + +#define EXTENT_MAP_LAST_BYTE ((u64)-4) +#define EXTENT_MAP_HOLE ((u64)-3) +#define EXTENT_MAP_INLINE ((u64)-2) +/* used only during fiemap calls */ +#define EXTENT_MAP_DELALLOC ((u64)-1) + +/* bits for the extent_map::flags field */ +enum { + /* this entry not yet on disk, don't free it */ + EXTENT_FLAG_PINNED, + EXTENT_FLAG_COMPRESSED, + /* pre-allocated extent */ + EXTENT_FLAG_PREALLOC, + /* Logging this extent */ + EXTENT_FLAG_LOGGING, + /* Filling in a preallocated extent */ + EXTENT_FLAG_FILLING, + /* filesystem extent mapping type */ + EXTENT_FLAG_FS_MAPPING, + /* This em is merged from two or more physically adjacent ems */ + EXTENT_FLAG_MERGED, +}; + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 mod_start; + u64 
mod_len; + u64 orig_start; + u64 orig_block_len; + u64 ram_bytes; + u64 block_start; + u64 block_len; + + /* + * Generation of the extent map, for merged em it's the highest + * generation of all merged ems. + * For non-merged extents, it's from btrfs_file_extent_item::generation. + */ + u64 generation; + unsigned long flags; + /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ + struct map_lookup *map_lookup; + refcount_t refs; + unsigned int compress_type; + struct list_head list; +}; + +struct extent_map_tree { + struct rb_root_cached map; + struct list_head modified_extents; + rwlock_t lock; +}; + +struct btrfs_inode; + +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified); +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); + +struct extent_map *alloc_extent_map(void); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void __cold extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, + struct extent_map **em_in, u64 start, u64 len); +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, + u64 start, u64 end, + bool skip_pinned); +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified); + +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 0000000000..45cae356e8 --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,1354 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "misc.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "bio.h" +#include "print-tree.h" +#include "compression.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "super.h" + +#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ + PAGE_SIZE)) + +/* + * Set inode's size according to filesystem options. + * + * @inode: inode we want to update the disk_i_size for + * @new_i_size: i_size we want to set to, 0 if we use i_size + * + * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() + * returns as it is perfectly fine with a file that has holes without hole file + * extent items. 
+ * + * However without NO_HOLES we need to only return the area that is contiguous + * from the 0 offset of the file. Otherwise we could end up adjust i_size up + * to an extent that has a gap in between. + * + * Finally new_i_size should only be set in the case of truncate where we're not + * ready to use i_size_read() as the limiter yet. + */ +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 start, end, i_size; + int ret; + + spin_lock(&inode->lock); + i_size = new_i_size ?: i_size_read(&inode->vfs_inode); + if (btrfs_fs_incompat(fs_info, NO_HOLES)) { + inode->disk_i_size = i_size; + goto out_unlock; + } + + ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); + if (!ret && start == 0) + i_size = min(i_size, end + 1); + else + i_size = 0; + inode->disk_i_size = i_size; +out_unlock: + spin_unlock(&inode->lock); +} + +/* + * Mark range within a file as having a new extent inserted. + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item + * + * Call when we are inserting a new file extent where there was none before. + * Does not need to call this in the case where we're replacing an existing file + * extent, however if not sure it's fine to call this multiple times. + * + * The start and len must match the file extent item, so thus must be sectorsize + * aligned. + */ +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len) +{ + if (len == 0) + return 0; + + ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); + + if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) + return 0; + return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); +} + +/* + * Mark an inode range as not having a backing extent. + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item + * + * Called when we drop a file extent, for example when we truncate. Doesn't + * need to be called for cases where we're replacing a file extent, like when + * we've COWed a file extent. + * + * The start and len must match the file extent item, so thus must be sectorsize + * aligned. 
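The helpers above track which file ranges are backed by on-disk extent items via EXTENT_DIRTY bits in the inode's file_extent_tree, and without NO_HOLES the disk_i_size may only advance over the region that is contiguously covered starting at offset 0. The following standalone sketch is not part of the patch; it models the dirty ranges as a sorted array instead of an extent-io tree, purely to illustrate that rule.

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start; uint64_t end; }; /* [start, end), sorted, non-overlapping */

/* Return the i_size that is safe to store on disk: the in-memory i_size
 * clamped to the extent coverage that is contiguous from offset 0. */
uint64_t safe_disk_i_size(uint64_t i_size, const struct range *r, int nr)
{
    uint64_t covered = 0;

    for (int i = 0; i < nr; i++) {
        if (r[i].start > covered)
            break;          /* a hole without an extent item: stop here */
        if (r[i].end > covered)
            covered = r[i].end;
    }
    return i_size < covered ? i_size : covered;
}

int main(void)
{
    /* Extent items cover [0, 8K) and [12K, 16K); the in-memory i_size is 16K.
     * Only [0, 8K) is contiguous from offset 0, so disk_i_size stays at 8K. */
    struct range r[] = { { 0, 8192 }, { 12288, 16384 } };

    printf("%llu\n", (unsigned long long)safe_disk_i_size(16384, r, 2));
    return 0;
}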
+ */ +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len) +{ + if (len == 0) + return 0; + + ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || + len == (u64)-1); + + if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) + return 0; + return clear_extent_bit(&inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, NULL); +} + +static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) +{ + ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize)); + + return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size; +} + +static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size) +{ + ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size)); + + return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits; +} + +static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) +{ + u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum), + fs_info->csum_size); + + return csum_size_to_bytes(fs_info, max_csum_size); +} + +/* + * Calculate the total size needed to allocate for an ordered sum structure + * spanning @bytes in the file. + */ +static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +{ + return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); +} + +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, u64 num_bytes) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + file_key.objectid = objectid; + file_key.offset = pos; + file_key.type = BTRFS_EXTENT_DATA_KEY; + + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); /* Can't happen */ + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, item, 0); + btrfs_set_file_extent_offset(leaf, item, 0); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, 0); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(trans, leaf); +out: + btrfs_free_path(path); + return ret; +} + +static struct btrfs_csum_item * +btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + const u32 csum_size = fs_info->csum_size; + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + file_key.type = BTRFS_EXTENT_CSUM_KEY; + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type != 
BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + fs_info->sectorsize_bits; + csums_in_item = btrfs_item_size(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset == csums_in_item) { + ret = -EFBIG; + goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + struct btrfs_key file_key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + file_key.type = BTRFS_EXTENT_DATA_KEY; + + return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); +} + +/* + * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and + * store the result to @dst. + * + * Return >0 for the number of sectors we found. + * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum + * for it. Caller may want to try next sector until one range is hit. + * Return <0 for fatal error. + */ +static int search_csum_tree(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 disk_bytenr, + u64 len, u8 *dst) +{ + struct btrfs_root *csum_root; + struct btrfs_csum_item *item = NULL; + struct btrfs_key key; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 itemsize; + int ret; + u64 csum_start; + u64 csum_len; + + ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) && + IS_ALIGNED(len, sectorsize)); + + /* Check if the current csum item covers disk_bytenr */ + if (path->nodes[0]) { + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; + + if (in_range(disk_bytenr, csum_start, csum_len)) + goto found; + } + + /* Current item doesn't contain the desired range, search again */ + btrfs_release_path(path); + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; + ASSERT(in_range(disk_bytenr, csum_start, csum_len)); + +found: + ret = (min(csum_start + csum_len, disk_bytenr + len) - + disk_bytenr) >> fs_info->sectorsize_bits; + read_extent_buffer(path->nodes[0], dst, (unsigned long)item, + ret * csum_size); +out: + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + return ret; +} + +/* + * Lookup the checksum for the read bio in csum tree. + * + * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
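search_csum_tree() above derives the byte range a csum item covers from its item size and then returns how many sectors of the requested range that item can serve. Here is a standalone sketch of that arithmetic, not part of the patch, assuming 4 KiB sectors and 4-byte crc32c checksums (the real values come from fs_info):

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE      4096u
#define SECTORSIZE_BITS 12
#define CSUM_SIZE       4u   /* crc32c */

/* Mirror of the coverage and count math in search_csum_tree(). */
uint64_t sectors_served(uint64_t csum_start, uint32_t item_size,
                        uint64_t disk_bytenr, uint64_t len)
{
    uint64_t csum_len = (uint64_t)(item_size / CSUM_SIZE) * SECTORSIZE;
    uint64_t csum_end = csum_start + csum_len;
    uint64_t range_end = disk_bytenr + len;
    uint64_t end = csum_end < range_end ? csum_end : range_end;

    return (end - disk_bytenr) >> SECTORSIZE_BITS;
}

int main(void)
{
    /* A 256-byte csum item starting at 1 MiB covers 64 sectors (256 KiB).
     * A 1 MiB read that starts 8 sectors into the item can take the
     * remaining 56 covered sectors from it, so this prints 56. */
    printf("%llu\n", (unsigned long long)
           sectors_served(1048576, 256, 1048576 + 8 * SECTORSIZE, 1048576));
    return 0;
}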
+ */ +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio = &bbio->bio; + struct btrfs_path *path; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; + blk_status_t ret = BLK_STS_OK; + u32 bio_offset = 0; + + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) + return BLK_STS_OK; + + /* + * This function is only called for read bio. + * + * This means two things: + * - All our csums should only be in csum tree + * No ordered extents csums, as ordered extents are only for write + * path. + * - No need to bother any other info from bvec + * Since we're looking up csums, the only important info is the + * disk_bytenr and the length, which can be extracted from bi_iter + * directly. + */ + ASSERT(bio_op(bio) == REQ_OP_READ); + path = btrfs_alloc_path(); + if (!path) + return BLK_STS_RESOURCE; + + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { + btrfs_free_path(path); + return BLK_STS_RESOURCE; + } + } else { + bbio->csum = bbio->csum_inline; + } + + /* + * If requested number of sectors is larger than one leaf can contain, + * kick the readahead for csum tree. + */ + if (nblocks > fs_info->csums_per_leaf) + path->reada = READA_FORWARD; + + /* + * the free space stuff is only read when it hasn't been + * updated in the current transaction. So, we can safely + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. + */ + if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + while (bio_offset < orig_len) { + int count; + u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; + u8 *csum_dst = bbio->csum + + (bio_offset >> fs_info->sectorsize_bits) * csum_size; + + count = search_csum_tree(fs_info, path, cur_disk_bytenr, + orig_len - bio_offset, csum_dst); + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + bbio->csum = NULL; + break; + } + + /* + * We didn't find a csum for this range. We need to make sure + * we complain loudly about this, because we are not NODATASUM. + * + * However for the DATA_RELOC inode we could potentially be + * relocating data extents for a NODATASUM inode, so the inode + * itself won't be marked with NODATASUM, but the extent we're + * copying is in fact NODATASUM. If we don't find a csum we + * assume this is the case. 
+ */ + if (count == 0) { + memset(csum_dst, 0, csum_size); + count = 1; + + if (inode->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + u64 file_offset = bbio->file_offset + bio_offset; + + set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); + } else { + btrfs_warn_rl(fs_info, + "csum hole found for disk bytenr range [%llu, %llu)", + cur_disk_bytenr, cur_disk_bytenr + sectorsize); + } + } + bio_offset += count * sectorsize; + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_csum_item *item; + LIST_HEAD(tmplist); + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->nowait = nowait; + if (search_commit) { + path->skip_locking = 1; + path->reada = READA_FORWARD; + path->search_commit_root = 1; + } + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
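The "previous csum item" check described just above decides whether to step back one slot by comparing how far start lies past the previous item's key offset against that item's size. A standalone sketch of the predicate with hypothetical numbers (4 KiB sectors and 4-byte checksums assumed):

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE_BITS 12
#define CSUM_SIZE       4u

uint64_t bytes_to_csum_bytes(uint64_t bytes)
{
    return (bytes >> SECTORSIZE_BITS) * CSUM_SIZE;
}

/* True if the previous csum item (key offset prev_offset, item size
 * item_size) still covers the start of the search range, in which case
 * the search has to begin from that previous item. */
bool prev_item_covers(uint64_t start, uint64_t prev_offset, uint32_t item_size)
{
    return bytes_to_csum_bytes(start - prev_offset) < item_size;
}

int main(void)
{
    /* A previous item at 1 MiB holding 128 bytes of csums covers 32 sectors,
     * i.e. up to 1 MiB + 128 KiB; a search starting at 1 MiB + 64 KiB falls
     * inside it, so this prints 1. */
    printf("%d\n", prev_item_covers(1048576 + 65536, 1048576, 128));
    return 0;
}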
+ */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + + size = min_t(size_t, csum_end - start, + max_ordered_sum_bytes(fs_info)); + sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), + GFP_NOFS); + if (!sums) { + ret = -ENOMEM; + goto fail; + } + + sums->logical = start; + sums->len = size; + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], + sums->sums, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + start += size; + list_add_tail(&sums->list, &tmplist); + } + path->slots[0]++; + } + ret = 0; +fail: + while (ret < 0 && !list_empty(&tmplist)) { + sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + list_splice_tail(&tmplist, list); + + btrfs_free_path(path); + return ret; +} + +/* + * Do the same work as btrfs_lookup_csums_list(), the difference is in how + * we return the result. + * + * This version will set the corresponding bits in @csum_bitmap to represent + * that there is a csum found. + * Each bit represents a sector. Thus caller should ensure @csum_buf passed + * in is large enough to contain all csums. + */ +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_csum_item *item; + const u64 orig_start = start; + bool free_path = false; + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + free_path = true; + } + + /* Check if we can reuse the previous path. 
*/ + if (path->nodes[0]) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY && + key.offset <= start) + goto search_forward; + btrfs_release_path(path); + } + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. + */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + +search_forward: + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, + start - orig_start); + + size = min_t(size_t, csum_end - start, end + 1 - start); + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], csum_dest, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + bitmap_set(csum_bitmap, + (start - orig_start) >> fs_info->sectorsize_bits, + size >> fs_info->sectorsize_bits); + + start += size; + } + path->slots[0]++; + } + ret = 0; +fail: + if (free_path) + btrfs_free_path(path); + return ret; +} + +/* + * Calculate checksums of the data contained inside a bio. 
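btrfs_csum_one_bio() below walks the bio segment by segment and produces one checksum per sectorsize block, appending each digest back to back into the ordered sum array. The following standalone sketch shows only that per-sector loop; it is not part of the patch, and toy_digest() is a placeholder standing in for the kernel's crypto_shash crc32c digest (4 KiB sectors and 4-byte checksums assumed).

#include <stdint.h>
#include <string.h>

#define SECTORSIZE 4096u
#define CSUM_SIZE  4u

/* Placeholder digest standing in for crypto_shash_digest() with crc32c. */
void toy_digest(const uint8_t *data, size_t len, uint8_t *out)
{
    uint32_t acc = 0;

    for (size_t i = 0; i < len; i++)
        acc = acc * 31 + data[i];
    memcpy(out, &acc, CSUM_SIZE);
}

/* One checksum per sector of @buf, written back to back into @sums.
 * @len must be sector aligned and @sums must hold
 * (len / SECTORSIZE) * CSUM_SIZE bytes, mirroring btrfs_ordered_sum_size(). */
void csum_buffer(const uint8_t *buf, size_t len, uint8_t *sums)
{
    size_t index = 0;

    for (size_t off = 0; off < len; off += SECTORSIZE) {
        toy_digest(buf + off, SECTORSIZE, sums + index);
        index += CSUM_SIZE;
    }
}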
+ */ +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) +{ + struct btrfs_ordered_extent *ordered = bbio->ordered; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + struct btrfs_ordered_sum *sums; + char *data; + struct bvec_iter iter; + struct bio_vec bvec; + int index; + unsigned int blockcount; + int i; + unsigned nofs_flag; + + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), + GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + + if (!sums) + return BLK_STS_RESOURCE; + + sums->len = bio->bi_iter.bi_size; + INIT_LIST_HEAD(&sums->list); + + sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + index = 0; + + shash->tfm = fs_info->csum_shash; + + bio_for_each_segment(bvec, bio, iter) { + blockcount = BTRFS_BYTES_TO_BLKS(fs_info, + bvec.bv_len + fs_info->sectorsize + - 1); + + for (i = 0; i < blockcount; i++) { + data = bvec_kmap_local(&bvec); + crypto_shash_digest(shash, + data + (i * fs_info->sectorsize), + fs_info->sectorsize, + sums->sums + index); + kunmap_local(data); + index += fs_info->csum_size; + } + + } + + bbio->sums = sums; + btrfs_add_ordered_sum(ordered, sums); + return 0; +} + +/* + * Nodatasum I/O on zoned file systems still requires an btrfs_ordered_sum to + * record the updated logical address on Zone Append completion. + * Allocate just the structure with an empty sums array here for that case. + */ +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +{ + bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); + if (!bbio->sums) + return BLK_STS_RESOURCE; + bbio->sums->len = bbio->bio.bi_iter.bi_size; + bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_add_ordered_sum(bbio->ordered, bbio->sums); + return 0; +} + +/* + * Remove one checksum overlapping a range. + * + * This expects the key to describe the csum pointed to by the path, and it + * expects the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the range should + * not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the overlap, + * and fixes up the key as required. 
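truncate_one_csum() below handles the two partial-overlap cases: chop checksums off the tail of the item when the deleted range starts inside it, or off the head when the deleted range ends inside it. A standalone sketch of the new item size computation for both cases, with 4 KiB sectors and 4-byte checksums assumed:

#include <stdint.h>

#define SECTORSIZE_BITS 12
#define CSUM_SIZE       4u

/* The item covers [key_offset, csum_end); [bytenr, bytenr + len) overlaps
 * exactly one end of it. Returns the item size after truncation. */
uint32_t truncated_item_size(uint64_t key_offset, uint64_t csum_end,
                             uint64_t bytenr, uint64_t len)
{
    uint64_t end_byte = bytenr + len;

    if (key_offset < bytenr && csum_end <= end_byte)
        /* Tail overlap: keep only the checksums in front of bytenr. */
        return ((bytenr - key_offset) >> SECTORSIZE_BITS) * CSUM_SIZE;

    /* Head overlap: keep the checksums behind end_byte; the caller also
     * moves the item key forward to end_byte. */
    return ((csum_end - end_byte) >> SECTORSIZE_BITS) * CSUM_SIZE;
}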
+ */ +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *leaf; + const u32 csum_size = fs_info->csum_size; + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = fs_info->sectorsize_bits; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + btrfs_truncate_item(trans, path, new_size, 1); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + btrfs_truncate_item(trans, path, new_size, 0); + + key->offset = end_byte; + btrfs_set_item_key_safe(trans, path, key); + } else { + BUG(); + } +} + +/* + * Delete the csum items from the csum tree for a given range of bytes. + */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret = 0; + const u32 csum_size = fs_info->csum_size; + u32 blocksize_bits = fs_info->sectorsize_bits; + + ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + break; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + int del_nr = 1; + + /* + * Check how many csum items preceding this one in this + * leaf correspond to our range and then delete them all + * at once. 
+ */ + if (key.offset > bytenr && path->slots[0] > 0) { + int slot = path->slots[0] - 1; + + while (slot >= 0) { + struct btrfs_key pk; + + btrfs_item_key_to_cpu(leaf, &pk, slot); + if (pk.offset < bytenr || + pk.type != BTRFS_EXTENT_CSUM_KEY || + pk.objectid != + BTRFS_EXTENT_CSUM_OBJECTID) + break; + path->slots[0] = slot; + del_nr++; + key.offset = pk.offset; + slot--; + } + } + ret = btrfs_del_items(trans, root, path, + path->slots[0], del_nr); + if (ret) + break; + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. + */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memzero_extent_buffer(leaf, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + if (ret && ret != -EAGAIN) { + btrfs_abort_transaction(trans, ret); + break; + } + ret = 0; + + key.offset = end_byte - 1; + } else { + truncate_one_csum(trans, path, &key, bytenr, len); + if (key.offset < bytenr) + break; + } + btrfs_release_path(path); + } + btrfs_free_path(path); + return ret; +} + +static int find_next_csum_offset(struct btrfs_root *root, + struct btrfs_path *path, + u64 *next_offset) +{ + const u32 nritems = btrfs_header_nritems(path->nodes[0]); + struct btrfs_key found_key; + int slot = path->slots[0] + 1; + int ret; + + if (nritems == 0 || slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *next_offset = (u64)-1; + return 0; + } + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) + *next_offset = (u64)-1; + else + *next_offset = found_key.offset; + + return 0; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 next_offset; + u64 total_bytes = 0; + u64 csum_offset; + u64 bytenr; + u32 ins_size; + int index = 0; + int found_next; + int ret; + const u32 csum_size = fs_info->csum_size; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + next_offset = (u64)-1; + found_next = 0; + bytenr = sums->logical + total_bytes; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + file_key.type = BTRFS_EXTENT_CSUM_KEY; + + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); + if (!IS_ERR(item)) { + ret = 
0; + leaf = path->nodes[0]; + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + + btrfs_item_size(leaf, path->slots[0])); + goto found; + } + ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto out; + + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(fs_info, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + /* We didn't find a csum item, insert one. */ + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + found_next = 1; + goto insert; + } + + /* + * At this point, we know the tree has a checksum item that ends at an + * offset matching the start of the checksum range we want to insert. + * We try to extend that item as much as possible and then add as many + * checksums to it as they fit. + * + * First check if the leaf has enough free space for at least one + * checksum. If it has go directly to the item extension code, otherwise + * release the path and do a search for insertion before the extension. + */ + if (btrfs_leaf_free_space(leaf) >= csum_size) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + fs_info->sectorsize_bits; + goto extend_csum; + } + + btrfs_release_path(path); + path->search_for_extension = 1; + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + path->search_for_extension = 0; + if (ret < 0) + goto out; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; + + if (found_key.type != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) { + goto insert; + } + +extend_csum: + if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / + csum_size) { + int extend_nr; + u64 tmp; + u32 diff; + + tmp = sums->len - total_bytes; + tmp >>= fs_info->sectorsize_bits; + WARN_ON(tmp < 1); + extend_nr = max_t(int, 1, tmp); + + /* + * A log tree can already have checksum items with a subset of + * the checksums we are trying to log. This can happen after + * doing a sequence of partial writes into prealloc extents and + * fsyncs in between, with a full fsync logging a larger subrange + * of an extent for which a previous fast fsync logged a smaller + * subrange. And this happens in particular due to merging file + * extent items when we complete an ordered extent for a range + * covered by a prealloc extent - this is done at + * btrfs_mark_extent_written(). + * + * So if we try to extend the previous checksum item, which has + * a range that ends at the start of the range we want to insert, + * make sure we don't extend beyond the start offset of the next + * checksum item. If we are at the last item in the leaf, then + * forget the optimization of extending and add a new checksum + * item - it is not worth the complexity of releasing the path, + * getting the first key for the next leaf, repeat the btree + * search, etc, because log trees are temporary anyway and it + * would only save a few bytes of leaf space. 
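The extension path that follows grows the existing csum item by diff bytes, where diff is capped three ways: by how many checksums are still needed, by the maximum csum item size, and by the free space left in the leaf, then rounded down to a whole number of checksums. A standalone sketch of that clamping; MAX_ITEM_CSUMS is a hypothetical stand-in for the MAX_CSUM_ITEMS(fs_info, csum_size) value.

#include <stdint.h>

#define CSUM_SIZE       4u
#define MAX_ITEM_CSUMS  1019u   /* hypothetical MAX_CSUM_ITEMS() result */

/* How many bytes to extend the current csum item by. */
uint32_t csum_item_extension(uint32_t csum_offset, uint32_t extend_nr,
                             uint32_t item_size, uint32_t leaf_free_space)
{
    uint32_t diff = (csum_offset + extend_nr) * CSUM_SIZE;

    /* Never grow past the maximum csum item size. */
    if (diff > MAX_ITEM_CSUMS * CSUM_SIZE)
        diff = MAX_ITEM_CSUMS * CSUM_SIZE;
    /* The kernel only reaches this point when the item currently ends at
     * exactly csum_offset checksums, so diff >= item_size holds there. */
    if (diff <= item_size)
        return 0;
    diff -= item_size;
    /* Never grow past what the leaf can still hold ... */
    if (diff > leaf_free_space)
        diff = leaf_free_space;
    /* ... and only by whole checksums. */
    return (diff / CSUM_SIZE) * CSUM_SIZE;
}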
+ */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (path->slots[0] + 1 >= + btrfs_header_nritems(path->nodes[0])) { + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + found_next = 1; + goto insert; + } + + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + + tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; + if (tmp <= INT_MAX) + extend_nr = min_t(int, extend_nr, tmp); + } + + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, + MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); + + diff = diff - btrfs_item_size(leaf, path->slots[0]); + diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); + diff /= csum_size; + diff *= csum_size; + + btrfs_extend_item(trans, path, diff); + ret = 0; + goto csum; + } + +insert: + btrfs_release_path(path); + csum_offset = 0; + if (found_next) { + u64 tmp; + + tmp = sums->len - total_bytes; + tmp >>= fs_info->sectorsize_bits; + tmp = min(tmp, (next_offset - file_key.offset) >> + fs_info->sectorsize_bits); + + tmp = max_t(u64, 1, tmp); + tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + if (ret < 0) + goto out; + if (WARN_ON(ret != 0)) + goto out; + leaf = path->nodes[0]; +csum: + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item + + btrfs_item_size(leaf, path->slots[0])); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits; + ins_size *= csum_size; + ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, + ins_size); + write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, + ins_size); + + index += ins_size; + ins_size /= csum_size; + total_bytes += ins_size * fs_info->sectorsize; + + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + if (total_bytes < sums->len) { + btrfs_release_path(path); + cond_resched(); + goto again; + } +out: + btrfs_free_path(path); + return ret; +} + +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + extent_end = btrfs_file_extent_end(path); + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em->generation = btrfs_file_extent_generation(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + 
em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + em->compress_type = compress_type; + if (compress_type != BTRFS_COMPRESS_NONE) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + } else { + btrfs_err(fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, " + "root %llu", type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} + +/* + * Returns the end offset (non inclusive) of the file extent item the given path + * points to. If it points to an inline extent, the returned offset is rounded + * up to the sector size. + */ +u64 btrfs_file_extent_end(const struct btrfs_path *path) +{ + const struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + end = btrfs_file_extent_ram_bytes(leaf, fi); + end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); + } else { + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + } + + return end; +} diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h new file mode 100644 index 0000000000..04bd2d34ef --- /dev/null +++ b/fs/btrfs/file-item.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_ITEM_H +#define BTRFS_FILE_ITEM_H + +#include "accessors.h" + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) + +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* + * Return the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. 
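The inline-extent helpers that follow treat everything in the file extent item past the disk_bytenr field as inline data, so the usable inline length is the item size minus that fixed header. A standalone sketch with a mock item layout; the real offsets come from struct btrfs_file_extent_item in accessors.h, this struct is only an illustration of the offsetof() idea.

#include <stddef.h>
#include <stdint.h>

/* Mock of the on-disk layout: the fields up to disk_bytenr form the header,
 * and inline data (when used) starts where disk_bytenr would be. */
struct mock_file_extent_item {
    uint64_t generation;
    uint64_t ram_bytes;
    uint8_t  compression;
    uint8_t  encryption;
    uint16_t other_encoding;
    uint8_t  type;
    uint64_t disk_bytenr;      /* inline data starts here */
} __attribute__((packed));

#define INLINE_DATA_START offsetof(struct mock_file_extent_item, disk_bytenr)

/* Bytes of inline (possibly compressed) data stored in an item of @item_size. */
uint32_t inline_item_len(uint32_t item_size)
{
    return item_size - INLINE_DATA_START;
}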
+ */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap); +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 0000000000..c997b79056 --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,3864 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "volumes.h" +#include "qgroup.h" +#include "compression.h" +#include "delalloc-space.h" +#include "reflink.h" +#include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "file-item.h" +#include "ioctl.h" +#include "file.h" +#include "super.h" + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. 
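btrfs_copy_from_user() below copies user data into the prepared pages one page at a time, where the first chunk may start mid-page and every chunk is capped at the bytes remaining in the current page. A standalone sketch of that chunking, not part of the patch, with memcpy standing in for copy_page_from_iter_atomic():

#include <string.h>

#define PAGE_SIZE 4096u

/* Copy @len bytes from @src into an array of page-sized buffers, starting
 * at file position @pos; the first page is written from pos % PAGE_SIZE. */
void copy_into_pages(unsigned char **pages, unsigned long long pos,
                     const unsigned char *src, size_t len)
{
    size_t offset = (size_t)(pos % PAGE_SIZE);  /* offset_in_page(pos) */
    size_t pg = 0;

    while (len > 0) {
        size_t count = PAGE_SIZE - offset;      /* room left in this page */

        if (count > len)
            count = len;
        memcpy(pages[pg] + offset, src, count); /* copy_page_from_iter_atomic() */
        src += count;
        len -= count;
        offset += count;
        if (offset == PAGE_SIZE) {              /* move to the next page */
            pg++;
            offset = 0;
        }
    }
}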
+ */ +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, + struct page **prepared_pages, + struct iov_iter *i) +{ + size_t copied = 0; + size_t total_copied = 0; + int pg = 0; + int offset = offset_in_page(pos); + + while (write_bytes > 0) { + size_t count = min_t(size_t, + PAGE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[pg]; + /* + * Copy data from userspace to the current page + */ + copied = copy_page_from_iter_atomic(page, offset, count, i); + + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (unlikely(copied < count)) { + if (!PageUptodate(page)) { + iov_iter_revert(i, copied); + copied = 0; + } + if (!copied) + break; + } + + write_bytes -= copied; + total_copied += copied; + offset += copied; + if (offset == PAGE_SIZE) { + pg++; + offset = 0; + } + } + return total_copied; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, + struct page **pages, size_t num_pages, + u64 pos, u64 copied) +{ + size_t i; + u64 block_start = round_down(pos, fs_info->sectorsize); + u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; + + ASSERT(block_len <= U32_MAX); + for (i = 0; i < num_pages; i++) { + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() + */ + btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, + block_len); + unlock_page(pages[i]); + put_page(pages[i]); + } +} + +/* + * After btrfs_copy_from_user(), update the following things for delalloc: + * - Mark newly dirtied pages as DELALLOC in the io tree. + * Used to advise which range is to be written back. 
+ * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Update inode size for past EOF write + */ +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + int err = 0; + int i; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(&inode->vfs_inode); + unsigned int extra_bits = 0; + + if (write_bytes == 0) + return 0; + + if (noreserve) + extra_bits |= EXTENT_NORESERVE; + + start_pos = round_down(pos, fs_info->sectorsize); + num_bytes = round_up(write_bytes + pos - start_pos, + fs_info->sectorsize); + ASSERT(num_bytes <= U32_MAX); + + end_of_last_block = start_pos + num_bytes - 1; + + /* + * The pages may have already been dirty, clear out old accounting so + * we can set things up properly + */ + clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + cached); + + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + extra_bits, cached); + if (err) + return err; + + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + + btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); + btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); + btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); + } + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) + i_size_write(&inode->vfs_inode, end_pos); + return 0; +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + * + * Note: the VFS' inode number of bytes is not updated, it's up to the caller + * to deal with that. We set the field 'bytes_found' of the arguments structure + * with the number of allocated bytes found in the target range, so that the + * caller can update the inode's number of bytes in an atomic way when + * replacing extents in a range to avoid races with stat(2). 
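btrfs_drop_extents() below distinguishes four ways an existing extent [item_start, item_end) can overlap the drop range [start, end): split in the middle, trim the front, trim the back, or delete it entirely. A standalone classifier of those cases, with illustrative names that are not kernel API:

enum drop_case {
    DROP_NONE,        /* no overlap */
    DROP_SPLIT,       /* drop range strictly inside the extent: split in two */
    DROP_TRIM_FRONT,  /* drop range covers the front of the extent */
    DROP_TRIM_BACK,   /* drop range covers the back of the extent */
    DROP_WHOLE,       /* extent entirely inside the drop range: delete it */
};

enum drop_case classify_drop(unsigned long long start, unsigned long long end,
                             unsigned long long item_start,
                             unsigned long long item_end)
{
    if (end <= item_start || start >= item_end)
        return DROP_NONE;
    if (start > item_start && end < item_end)
        return DROP_SPLIT;
    if (start <= item_start && end >= item_end)
        return DROP_WHOLE;
    if (start <= item_start)
        return DROP_TRIM_FRONT;   /* keep [end, item_end) */
    return DROP_TRIM_BACK;        /* keep [item_start, start) */
}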
+ */ +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_ref ref = { 0 }; + struct btrfs_key key; + struct btrfs_key new_key; + u64 ino = btrfs_ino(inode); + u64 search_start = args->start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + u64 last_end = args->start; + int del_nr = 0; + int del_slot = 0; + int extent_type; + int recow; + int ret; + int modify_tree = -1; + int update_refs; + int found = 0; + struct btrfs_path *path = args->path; + + args->bytes_found = 0; + args->extent_inserted = false; + + /* Must always have a path if ->replace_extent is true */ + ASSERT(!(args->replace_extent && !args->path)); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + } + + if (args->drop_cache) + btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); + + if (args->start >= inode->disk_i_size && !args->replace_extent) + modify_tree = 0; + + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + while (1) { + recow = 0; + ret = btrfs_lookup_file_extent(trans, root, path, ino, + search_start, modify_tree); + if (ret < 0) + break; + if (ret > 0 && path->slots[0] > 0 && search_start == args->start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + ret = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY) { + ASSERT(del_nr == 0); + path->slots[0]++; + goto next_slot; + } + if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_ram_bytes(leaf, fi); + } else { + /* can't happen */ + BUG(); + } + + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. 
+ */ + if (extent_end == key.offset && extent_end >= search_start) { + last_end = extent_end; + goto delete_extent_item; + } + + if (extent_end <= search_start) { + path->slots[0]++; + goto next_slot; + } + + found = 1; + search_start = max(key.offset, args->start); + if (recow || !modify_tree) { + modify_tree = -1; + btrfs_release_path(path); + continue; + } + + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (args->start > key.offset && args->end < extent_end) { + BUG_ON(del_nr > 0); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = args->start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + continue; + } + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + args->start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += args->start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - args->start); + btrfs_mark_buffer_dirty(trans, leaf); + + if (update_refs && disk_bytenr > 0) { + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + disk_bytenr, num_bytes, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, + new_key.objectid, + args->start - extent_offset, + 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + key.offset = args->start; + } + /* + * From here on out we will have actually dropped something, so + * last_end can be updated. 
+ */ + last_end = extent_end; + + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (args->start <= key.offset && args->end < extent_end) { + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = args->end; + btrfs_set_item_key_safe(trans, path, &new_key); + + extent_offset += args->end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - args->end); + btrfs_mark_buffer_dirty(trans, leaf); + if (update_refs && disk_bytenr > 0) + args->bytes_found += args->end - key.offset; + break; + } + + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (args->start > key.offset && args->end >= extent_end) { + BUG_ON(del_nr > 0); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } + + btrfs_set_file_extent_num_bytes(leaf, fi, + args->start - key.offset); + btrfs_mark_buffer_dirty(trans, leaf); + if (update_refs && disk_bytenr > 0) + args->bytes_found += extent_end - args->start; + if (args->end == extent_end) + break; + + path->slots[0]++; + goto next_slot; + } + + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (args->start <= key.offset && args->end >= extent_end) { +delete_extent_item: + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } + + if (update_refs && + extent_type == BTRFS_FILE_EXTENT_INLINE) { + args->bytes_found += extent_end - key.offset; + extent_end = ALIGN(extent_end, + fs_info->sectorsize); + } else if (update_refs && disk_bytenr > 0) { + btrfs_init_generic_ref(&ref, + BTRFS_DROP_DELAYED_REF, + disk_bytenr, num_bytes, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, + key.objectid, + key.offset - extent_offset, 0, + false); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + args->bytes_found += extent_end - key.offset; + } + + if (args->end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(path); + continue; + } + + BUG(); + } + + if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion (if args->replace_extent). + */ + path->slots[0] = del_slot; + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + if (ret) + btrfs_abort_transaction(trans, ret); + } + + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. 
+ */ + if (!ret && args->replace_extent && + path->locks[0] == BTRFS_WRITE_LOCK && + btrfs_leaf_free_space(leaf) >= + sizeof(struct btrfs_item) + args->extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = args->start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; + } + btrfs_setup_item_for_insert(trans, root, path, &key, + args->extent_item_size); + args->extent_inserted = true; + } + + if (!args->path) + btrfs_free_path(path); + else if (!args->extent_inserted) + btrfs_release_path(path); +out: + args->drop_end = found ? min(args->end, last_end) : args->end; + + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. 
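btrfs_mark_extent_written() below converts the written part [start, end) of a preallocated extent to a regular extent, splitting the item when the written range does not reach one or both edges of the extent. A small illustration of how many pieces result, purely illustrative and not kernel API:

/* A prealloc extent [extent_start, extent_end) with [start, end) written:
 * returns how many file extent items describe it afterwards, before any
 * merging with neighbouring extents. */
int pieces_after_mark_written(unsigned long long extent_start,
                              unsigned long long extent_end,
                              unsigned long long start,
                              unsigned long long end)
{
    int pieces = 1;                 /* the written, now regular, part */

    if (start > extent_start)
        pieces++;                   /* leading part stays preallocated */
    if (end < extent_end)
        pieces++;                   /* trailing part stays preallocated */
    return pieces;
}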
+ */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_ref ref = { 0 }; + struct btrfs_key key; + struct btrfs_key new_key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 orig_offset; + u64 other_start; + u64 other_end; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; + int ret = 0; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + recow = 0; + split = start; + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = split; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || + key.type != BTRFS_EXTENT_DATA_KEY) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (key.offset > start || extent_end < end) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); + + if (start == key.offset && end < extent_end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(trans, leaf); + goto out; + } + } + + if (start > key.offset && end == extent_end) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, path, &new_key); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(trans, leaf); + goto out; + } + } + + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; + + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, 
&new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + goto again; + } + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(trans, leaf); + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, + num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, + orig_offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + if (split == start) { + key.offset = start; + } else { + if (start != key.offset) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + path->slots[0]--; + extent_end = end; + } + recow = 1; + } + + other_start = end; + other_end = 0; + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, + 0, false); + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_mark_buffer_dirty(trans, leaf); + } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(trans, leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct inode *inode, + struct page *page, u64 pos, + bool force_uptodate) +{ + struct folio *folio = page_folio(page); + int ret = 0; + + if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && + !PageUptodate(page)) { + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + + /* + * Since btrfs_read_folio() will unlock the folio before it + * returns, there is a window where 
btrfs_release_folio() can be + * called to release the page. Here we check both inode + * mapping and PagePrivate() to make sure the page was not + * released. + * + * The private flag check is essential for subpage as we need + * to store extra bitmap using page->private. + */ + if (page->mapping != inode->i_mapping || !PagePrivate(page)) { + unlock_page(page); + return -EAGAIN; + } + } + return 0; +} + +static fgf_t get_prepare_fgp_flags(bool nowait) +{ + fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + + if (nowait) + fgp_flags |= FGP_NOWAIT; + + return fgp_flags; +} + +static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) +{ + gfp_t gfp; + + gfp = btrfs_alloc_write_mask(inode->i_mapping); + if (nowait) { + gfp &= ~__GFP_DIRECT_RECLAIM; + gfp |= GFP_NOWAIT; + } + + return gfp; +} + +/* + * this just gets pages into the page cache and locks them down. + */ +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate, + bool nowait) +{ + int i; + unsigned long index = pos >> PAGE_SHIFT; + gfp_t mask = get_prepare_gfp_flags(inode, nowait); + fgf_t fgp_flags = get_prepare_fgp_flags(nowait); + int err = 0; + int faili; + + for (i = 0; i < num_pages; i++) { +again: + pages[i] = pagecache_get_page(inode->i_mapping, index + i, + fgp_flags, mask | __GFP_WRITE); + if (!pages[i]) { + faili = i - 1; + if (nowait) + err = -EAGAIN; + else + err = -ENOMEM; + goto fail; + } + + err = set_page_extent_mapped(pages[i]); + if (err < 0) { + faili = i; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(inode, pages[i], pos, + force_uptodate); + if (!err && i == num_pages - 1) + err = prepare_uptodate_page(inode, pages[i], + pos + write_bytes, false); + if (err) { + put_page(pages[i]); + if (!nowait && err == -EAGAIN) { + err = 0; + goto again; + } + faili = i - 1; + goto fail; + } + wait_on_page_writeback(pages[i]); + } + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + put_page(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. 
+ * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, + u64 *lockstart, u64 *lockend, bool nowait, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = round_down(pos, fs_info->sectorsize); + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; + + if (start_pos < inode->vfs_inode.i_size) { + struct btrfs_ordered_extent *ordered; + + if (nowait) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state)) { + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + pages[i] = NULL; + } + + return -EAGAIN; + } + } else { + lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + } + + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); + if (ordered && + ordered->file_offset + ordered->num_bytes > start_pos && + ordered->file_offset <= last_pos) { + unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; + } + + /* + * We should be called after prepare_pages() which should have locked + * all pages in the range. + */ + for (i = 0; i < num_pages; i++) + WARN_ON(!PageLocked(pages[i])); + + return ret; +} + +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset. + * @write_bytes: The length to write, will be updated to the nocow writeable + * range. + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. + * + * Return: + * > 0 If we can nocow, and updates @write_bytes. + * 0 If we can't do a nocow write. + * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * root is in progress. + * < 0 If an error happened. + * + * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 
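+ *
+ * A minimal caller-side sketch (hypothetical, but it mirrors how the
+ * buffered write path later in this file pairs the lock with the unlock):
+ *
+ *   ret = btrfs_check_nocow_lock(inode, pos, &write_bytes, nowait);
+ *   if (ret > 0) {
+ *           write up to the (possibly shrunken) write_bytes without COW;
+ *           btrfs_check_nocow_unlock(inode);
+ *   }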
+ */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return 0; + + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) + return -EAGAIN; + + lockstart = round_down(pos, fs_info->sectorsize); + lockend = round_up(pos + *write_bytes, + fs_info->sectorsize) - 1; + num_bytes = lockend - lockstart + 1; + + if (nowait) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, + &cached_state)) { + btrfs_drew_write_unlock(&root->snapshot_lock); + return -EAGAIN; + } + } else { + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, + &cached_state); + } + ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, + NULL, NULL, NULL, nowait, false); + if (ret <= 0) + btrfs_drew_write_unlock(&root->snapshot_lock); + else + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + return ret; +} + +void btrfs_check_nocow_unlock(struct btrfs_inode *inode) +{ + btrfs_drew_write_unlock(&inode->root->snapshot_lock); +} + +static void update_time_for_write(struct inode *inode) +{ + struct timespec64 now, ctime; + + if (IS_NOCMTIME(inode)) + return; + + now = current_time(inode); + if (!timespec64_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + ctime = inode_get_ctime(inode); + if (!timespec64_equal(&ctime, &now)) + inode_set_ctime_to_ts(inode, now); + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + +static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, + size_t count) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos = iocb->ki_pos; + int ret; + loff_t oldsize; + loff_t start_pos; + + /* + * Quickly bail out on NOWAIT writes if we don't have the nodatacow or + * prealloc flags, as without those flags we always have to COW. We will + * later check if we can really COW into the target range (using + * can_nocow_extent() at btrfs_get_blocks_direct_write()). + */ + if ((iocb->ki_flags & IOCB_NOWAIT) && + !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return -EAGAIN; + + ret = file_remove_privs(file); + if (ret) + return ret; + + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. 
+ */ + update_time_for_write(inode); + + start_pos = round_down(pos, fs_info->sectorsize); + oldsize = i_size_read(inode); + if (start_pos > oldsize) { + /* Expand hole size to cover write data, preventing empty gap */ + loff_t end_pos = round_up(pos + count, fs_info->sectorsize); + + ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); + if (ret) + return ret; + } + + return 0; +} + +static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, + struct iov_iter *i) +{ + struct file *file = iocb->ki_filp; + loff_t pos; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page **pages = NULL; + struct extent_changeset *data_reserved = NULL; + u64 release_bytes = 0; + u64 lockstart; + u64 lockend; + size_t num_written = 0; + int nrptrs; + ssize_t ret; + bool only_release_metadata = false; + bool force_page_uptodate = false; + loff_t old_isize = i_size_read(inode); + unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); + unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + + if (nowait) + ilock_flags |= BTRFS_ILOCK_TRY; + + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; + + ret = generic_write_checks(iocb, i); + if (ret <= 0) + goto out; + + ret = btrfs_write_check(iocb, i, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), + PAGE_SIZE / (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + while (iov_iter_count(i) > 0) { + struct extent_state *cached_state = NULL; + size_t offset = offset_in_page(pos); + size_t sector_offset; + size_t write_bytes = min(iov_iter_count(i), + nrptrs * (size_t)PAGE_SIZE - + offset); + size_t num_pages; + size_t reserve_bytes; + size_t dirty_pages; + size_t copied; + size_t dirty_sectors; + size_t num_sectors; + int extents_locked; + + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { + ret = -EFAULT; + break; + } + + only_release_metadata = false; + sector_offset = pos & (fs_info->sectorsize - 1); + + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &data_reserved, pos, + write_bytes, nowait); + if (ret < 0) { + int can_nocow; + + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { + ret = -EAGAIN; + break; + } + + /* + * If we don't have to COW at the offset, reserve + * metadata only. write_bytes may get smaller than + * requested here. 
+ */ + can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, + &write_bytes, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) + break; + only_release_metadata = true; + } + + num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); + WARN_ON(num_pages > nrptrs); + reserve_bytes = round_up(write_bytes + sector_offset, + fs_info->sectorsize); + WARN_ON(reserve_bytes == 0); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + reserve_bytes, + reserve_bytes, nowait); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, pos, + write_bytes); + else + btrfs_check_nocow_unlock(BTRFS_I(inode)); + + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; + break; + } + + release_bytes = reserve_bytes; +again: + ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + break; + } + + /* + * This is going to setup the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate, false); + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); + break; + } + + extents_locked = lock_and_cleanup_extent_if_need( + BTRFS_I(inode), pages, + num_pages, pos, write_bytes, &lockstart, + &lockend, nowait, &cached_state); + if (extents_locked < 0) { + if (!nowait && extents_locked == -EAGAIN) + goto again; + + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); + ret = extents_locked; + break; + } + + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + + num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + fs_info->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) { + force_page_uptodate = true; + dirty_sectors = 0; + dirty_pages = 0; + } else { + force_page_uptodate = false; + dirty_pages = DIV_ROUND_UP(copied + offset, + PAGE_SIZE); + } + + if (num_sectors > dirty_sectors) { + /* release everything except the sectors we dirtied */ + release_bytes -= dirty_sectors << fs_info->sectorsize_bits; + if (only_release_metadata) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), + release_bytes, true); + } else { + u64 __pos; + + __pos = round_down(pos, + fs_info->sectorsize) + + (dirty_pages << PAGE_SHIFT); + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, __pos, + release_bytes, true); + } + } + + release_bytes = round_up(copied + sector_offset, + fs_info->sectorsize); + + ret = btrfs_dirty_pages(BTRFS_I(inode), pages, + dirty_pages, pos, copied, + &cached_state, only_release_metadata); + + /* + * If we have not locked the extent range, because the range's + * start offset is >= i_size, we might still have a non-NULL + * cached extent state, acquired while marking the extent range + * as delalloc through btrfs_dirty_pages(). Therefore free any + * possible cached extent state to avoid a memory leak. 
+ */ + if (extents_locked) + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); + else + free_extent_state(cached_state); + + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + if (ret) { + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + break; + } + + release_bytes = 0; + if (only_release_metadata) + btrfs_check_nocow_unlock(BTRFS_I(inode)); + + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + + cond_resched(); + + pos += copied; + num_written += copied; + } + + kfree(pages); + + if (release_bytes) { + if (only_release_metadata) { + btrfs_check_nocow_unlock(BTRFS_I(inode)); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + release_bytes, true); + } else { + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes, true); + } + } + + extent_changeset_free(data_reserved); + if (num_written > 0) { + pagecache_isize_extended(inode, old_isize, iocb->ki_pos); + iocb->ki_pos += num_written; + } +out: + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + return num_written ? num_written : ret; +} + +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + const u32 blocksize_mask = fs_info->sectorsize - 1; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos; + ssize_t written = 0; + ssize_t written_buffered; + size_t prev_left = 0; + loff_t endbyte; + ssize_t err; + unsigned int ilock_flags = 0; + struct iomap_dio *dio; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. + */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) + ilock_flags |= BTRFS_ILOCK_SHARED; + +relock: + err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (err < 0) + return err; + + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + + err = generic_write_checks(iocb, from); + if (err <= 0) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + return err; + } + + err = btrfs_write_check(iocb, from, err); + if (err < 0) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto out; + } + + pos = iocb->ki_pos; + /* + * Re-check since file size may have changed just before taking the + * lock or pos may have changed because of O_APPEND in generic_write_check() + */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && + pos + iov_iter_count(from) > i_size_read(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + + if (check_direct_IO(fs_info, from, pos)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } + + /* + * The iov_iter can be mapped to the same file range we are writing to. 
+ * If that's the case, then we will deadlock in the iomap code, because + * it first calls our callback btrfs_dio_iomap_begin(), which will create + * an ordered extent, and after that it will fault in the pages that the + * iov_iter refers to. During the fault in we end up in the readahead + * pages code (starting at btrfs_readahead()), which will lock the range, + * find that ordered extent and then wait for it to complete (at + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since + * obviously the ordered extent can never complete as we didn't submit + * yet the respective bio(s). This always happens when the buffer is + * memory mapped to the same file range, since the iomap DIO code always + * invalidates pages in the target file range (after starting and waiting + * for any writeback). + * + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ + from->nofault = true; + dio = btrfs_dio_write(iocb, from, written); + from->nofault = false; + + /* + * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync + * iocb, and that needs to lock the inode. So unlock it before calling + * iomap_dio_complete() to avoid a deadlock. + */ + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + + if (IS_ERR_OR_NULL(dio)) + err = PTR_ERR_OR_ZERO(dio); + else + err = iomap_dio_complete(dio); + + /* No increment (+=) because iomap returns a cumulative value. */ + if (err > 0) + written = err; + + if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { + const size_t left = iov_iter_count(from); + /* + * We have more data left to write. Try to fault in as many as + * possible of the remainder pages and retry. We do this without + * releasing and locking again the inode, to prevent races with + * truncate. + * + * Also, in case the iov refers to pages in the file range of the + * file we want to write to (due to a mmap), we could enter an + * infinite loop if we retry after faulting the pages in, since + * iomap will invalidate any pages in the range early on, before + * it tries to fault in the pages of the iov. So we keep track of + * how much was left of iov in the previous EFAULT and fallback + * to buffered IO in case we haven't made any progress. + */ + if (left == prev_left) { + err = -ENOTBLK; + } else { + fault_in_iov_iter_readable(from, left); + prev_left = left; + goto relock; + } + } + + /* + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. + */ + if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) + goto out; + +buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * because even if we end up not blocking during the buffered IO attempt + * below, we will block when flushing and waiting for the IO. + */ + if (iocb->ki_flags & IOCB_NOWAIT) { + err = -EAGAIN; + goto out; + } + + pos = iocb->ki_pos; + written_buffered = btrfs_buffered_write(iocb, from); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. 
+ */ + endbyte = pos + written_buffered - 1; + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); +out: + return err < 0 ? err : written; +} + +static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t count; + ssize_t ret; + + btrfs_inode_lock(BTRFS_I(inode), 0); + count = encoded->len; + ret = generic_write_checks_count(iocb, &count); + if (ret == 0 && count != encoded->len) { + /* + * The write got truncated by generic_write_checks_count(). We + * can't do a partial encoded write. + */ + ret = -EFBIG; + } + if (ret || encoded->len == 0) + goto out; + + ret = btrfs_write_check(iocb, from, encoded->len); + if (ret < 0) + goto out; + + ret = btrfs_do_encoded_write(iocb, from, encoded); +out: + btrfs_inode_unlock(BTRFS_I(inode), 0); + return ret; +} + +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct file *file = iocb->ki_filp; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + ssize_t num_written, num_sync; + + /* + * If the fs flips readonly due to some impossible error, although we + * have opened a file as writable, we have to stop this write operation + * to ensure consistency. + */ + if (BTRFS_FS_ERROR(inode->root->fs_info)) + return -EROFS; + + if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (encoded) { + num_written = btrfs_encoded_write(iocb, from, encoded); + num_sync = encoded->len; + } else if (iocb->ki_flags & IOCB_DIRECT) { + num_written = btrfs_direct_write(iocb, from); + num_sync = num_written; + } else { + num_written = btrfs_buffered_write(iocb, from); + num_sync = num_written; + } + + btrfs_set_inode_last_sub_trans(inode); + + if (num_sync > 0) { + num_sync = generic_write_sync(iocb, num_sync); + if (num_sync < 0) + num_written = num_sync; + } + + return num_written; +} + +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + return btrfs_do_write_iter(iocb, from, NULL); +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + struct btrfs_file_private *private = filp->private_data; + + if (private) { + kfree(private->filldir_buf); + free_extent_state(private->llseek_cached_state); + kfree(private); + filp->private_data = NULL; + } + + /* + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. + */ + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + return 0; +} + +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + struct blk_plug plug; + + /* + * This is only called in fsync, which would do synchronous writes, so + * a plug can merge adjacent IOs as much as possible. Esp. in case of + * multiple disks using raid profile, a large IO can be split to + * several segments of stripe length (currently 64K). 
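+ *
+ * (blk_start_plug()/blk_finish_plug() below bracket the call to
+ * btrfs_fdatawrite_range() so the block layer can merge the queued bios
+ * before they are flushed to the device.)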
+ */ + blk_start_plug(&plug); + ret = btrfs_fdatawrite_range(inode, start, end); + blk_finish_plug(&plug); + + return ret; +} + +static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_inode_in_log(inode, fs_info->generation) && + list_empty(&ctx->ordered_extents)) + return true; + + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ + if (inode->last_trans <= fs_info->last_trans_committed && + (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || + list_empty(&ctx->ordered_extents))) + return true; + + return false; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0, err; + u64 len; + bool full_sync; + + trace_btrfs_sync_file(file, datasync); + + btrfs_init_log_ctx(&ctx, inode); + + /* + * Always set the range to a full range, otherwise we can get into + * several problems, from missing file extent items to represent holes + * when not using the NO_HOLES feature, to log tree corruption due to + * races between hole detection during logging and completion of ordered + * extents outside the range, to missing checksums due to ordered extents + * for which we flushed only a subset of their pages. + */ + start = 0; + end = LLONG_MAX; + len = (u64)LLONG_MAX + 1; + + /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) + goto out; + + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + + atomic_inc(&root->log_batch); + + /* + * Before we acquired the inode's lock and the mmap lock, someone may + * have dirtied more pages in the target range. We need to make sure + * that writeback for any such pages does not start while we are logging + * the inode, because if it does, any of the following might happen when + * we are not doing a full inode sync: + * + * 1) We log an extent after its writeback finishes but before its + * checksums are added to the csum tree, leading to -EIO errors + * when attempting to read the extent after a log replay. + * + * 2) We can end up logging an extent before its writeback finishes. 
+ * Therefore after the log replay we will have a file extent item + * pointing to an unwritten extent (and no data checksums as well). + * + * So trigger writeback for any eventual new dirty pages and then we + * wait for all ordered extents to complete below. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + goto out; + } + + /* + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. + * We check the flag here after starting delalloc above, because when + * running delalloc the full sync flag may be set if we need to drop + * extra extent map ranges due to temporary memory allocation failures. + */ + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + /* + * We have to do this here to avoid the priority inversion of waiting on + * IO of a lower priority task while holding a transaction open. + * + * For a full fsync we wait for the ordered extents to complete while + * for a fast fsync we wait just for writeback to complete, and then + * attach the ordered extents to the transaction so that a transaction + * commit waits for their completion, to avoid data loss if we fsync, + * the current transaction commits before the ordered extents complete + * and a power failure happens right after that. + * + * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the + * logical address recorded in the ordered extent may change. We need + * to wait for the IO to stabilize the logical address. + */ + if (full_sync || btrfs_is_zoned(fs_info)) { + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Get our ordered extents as soon as possible to avoid doing + * checksum lookups in the csum tree, and use instead the + * checksums attached to the ordered extents. + */ + btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), + &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->i_mapping, start, end); + } + + if (ret) + goto out_release_extents; + + atomic_inc(&root->log_batch); + + smp_mb(); + if (skip_inode_logging(&ctx)) { + /* + * We've had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever + * reason, it's no longer relevant. + */ + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + /* + * An ordered extent might have started before and completed + * already with io errors, in which case the inode was not + * updated and we end up here. So check the inode's mapping + * for any errors that might have happened since we last + * checked called fsync. + */ + ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + goto out_release_extents; + } + + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking joiners. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. 
+ */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_release_extents; + } + trans->in_fsync = true; + + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); + if (ret < 0) { + /* Fallthrough and commit/free transaction. */ + ret = BTRFS_LOG_FORCE_COMMIT; + } + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); + if (!ret) { + ret = btrfs_end_transaction(trans); + goto out; + } + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { + ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. + */ + if (ret == -ENOENT) + ret = 0; + goto out; + } + } + + ret = btrfs_commit_transaction(trans); +out: + ASSERT(list_empty(&ctx.list)); + ASSERT(list_empty(&ctx.conflict_inodes)); + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; + return ret > 0 ? 
-EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + goto out; +} + +static const struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->read_folio) + return -ENOEXEC; + + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + + return 0; +} + +static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_disk_bytenr(leaf, fi)) + return 0; + + if (key.offset == end) + return 1; + if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) + return 1; + return 0; +} + +static int fill_holes(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 offset, u64 end) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct extent_map *hole_em; + struct btrfs_key key; + int ret; + + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + goto out; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret <= 0) { + /* + * We should have dropped this offset, so if we find it then + * something has gone horribly wrong. 
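+ *
+ * (A return of 0 from btrfs_search_slot() means an exact key match at
+ * this offset, hence the -EINVAL below.)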
+ */ + if (ret == 0) + ret = -EINVAL; + return ret; + } + + leaf = path->nodes[0]; + if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { + u64 num_bytes; + + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + + end - offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_mark_buffer_dirty(trans, leaf); + goto out; + } + + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { + u64 num_bytes; + + key.offset = offset; + btrfs_set_item_key_safe(trans, path, &key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - + offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_mark_buffer_dirty(trans, leaf); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, + end - offset); + if (ret) + return ret; + +out: + btrfs_release_path(path); + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_map_range(inode, offset, end - 1, false); + btrfs_set_inode_full_sync(inode); + } else { + hole_em->start = offset; + hole_em->len = end - offset; + hole_em->ram_bytes = hole_em->len; + hole_em->orig_start = offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->orig_block_len = 0; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + + ret = btrfs_replace_extent_map_range(inode, hole_em, true); + free_extent_map(hole_em); + if (ret) + btrfs_set_inode_full_sync(inode); + } + + return 0; +} + +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, + round_down(*start, fs_info->sectorsize), + round_up(*len, fs_info->sectorsize)); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) +{ + /* + * For subpage case, if the range is not at page boundary, we could + * have pages at the leading/tailing part of the range. + * This could lead to dead loop since filemap_range_has_page() + * will always return true. + * So here we need to do extra page alignment for + * filemap_range_has_page(). 
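+ *
+ * (The rounded values below restrict the check to pages that lie
+ * entirely inside [lockstart, lockend]; pages straddling the range
+ * boundaries may legitimately remain in the page cache.)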
+ */ + const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + + while (1) { + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); + /* + * We can't have ordered extents in the range, nor dirty/writeback + * pages, because we have locked the inode's VFS lock in exclusive + * mode, we have locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range and we have waited + * for any ordered extents in the range to complete. + * We can race with anyone reading pages from this range, so after + * locking the range check if we have pages in the range, and if + * we do, unlock the range and retry. + */ + if (!filemap_range_has_page(inode->i_mapping, page_lockstart, + page_lockend)) + break; + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); + } + + btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); +} + +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len, + const u64 bytes_to_drop) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int slot; + struct btrfs_ref ref = { 0 }; + int ret; + + if (replace_len == 0) + return 0; + + if (extent_info->disk_offset == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) { + btrfs_update_inode_bytes(inode, 0, bytes_to_drop); + return 0; + } + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = extent_info->file_offset; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_file_extent_item)); + if (ret) + return ret; + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, extent_info->extent_buf, + btrfs_item_ptr_offset(leaf, slot), + sizeof(struct btrfs_file_extent_item)); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, + replace_len); + if (ret) + return ret; + + /* If it's a hole, nothing more needs to be done. 
*/ + if (extent_info->disk_offset == 0) { + btrfs_update_inode_bytes(inode, 0, bytes_to_drop); + return 0; + } + + btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); + + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = extent_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(inode), + extent_info->file_offset, + extent_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(inode), ref_offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + extent_info->insertions++; + + return ret; +} + +/* + * The respective range must have been previously locked, as well as the inode. + * The end offset is inclusive (last byte of the range). + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. + */ +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_drop_extents_args drop_args = { 0 }; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); + struct btrfs_trans_handle *trans = NULL; + struct btrfs_block_rsv *rsv; + unsigned int rsv_count; + u64 cur_offset; + u64 len = end - start; + int ret = 0; + + if (end <= start) + return -EINVAL; + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out; + } + rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv->failfast = true; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) + rsv_count = 3; + else + rsv_count = 2; + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + min_size, false); + if (WARN_ON(ret)) + goto out_trans; + trans->block_rsv = rsv; + + cur_offset = start; + drop_args.path = path; + drop_args.end = end + 1; + drop_args.drop_cache = true; + while (cur_offset < end) { + drop_args.start = cur_offset; + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + /* If we are punching a hole decrement the inode's byte count */ + if (!extent_info) + btrfs_update_inode_bytes(inode, 0, + drop_args.bytes_found); + if (ret != -ENOSPC) { + /* + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * clone we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. 
+ */ + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) + btrfs_abort_transaction(trans, ret); + break; + } + + trans->block_rsv = &fs_info->trans_block_rsv; + + if (!extent_info && cur_offset < drop_args.drop_end && + cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); + if (ret) { + /* + * If we failed then we didn't insert our hole + * entries for the area we dropped, so now the + * fs is corrupted, so we must abort the + * transaction. + */ + btrfs_abort_transaction(trans, ret); + break; + } + } else if (!extent_info && cur_offset < drop_args.drop_end) { + /* + * We are past the i_size here, but since we didn't + * insert holes we need to clear the mapped area so we + * know to not set disk_i_size in this area until a new + * file extent is inserted here. + */ + ret = btrfs_inode_clear_file_extent_range(inode, + cur_offset, + drop_args.drop_end - cur_offset); + if (ret) { + /* + * We couldn't clear our area, so we could + * presumably adjust up and corrupt the fs, so + * we need to abort. + */ + btrfs_abort_transaction(trans, ret); + break; + } + } + + if (extent_info && + drop_args.drop_end > extent_info->file_offset) { + u64 replace_len = drop_args.drop_end - + extent_info->file_offset; + + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len, + drop_args.bytes_found); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; + } + + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. + */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) + inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + break; + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, + rsv, min_size, false); + if (WARN_ON(ret)) + break; + trans->block_rsv = rsv; + + cur_offset = drop_args.drop_end; + len = end - cur_offset; + if (!extent_info && len) { + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } + } + } + + /* + * If we were cloning, force the next fsync to be a full one since we + * we replaced (or just dropped in the case of cloning holes when + * NO_HOLES is enabled) file extent items and did not setup new extent + * maps for the replacement extents (or holes). 
+ */ + if (extent_info && !extent_info->is_new_extent) + btrfs_set_inode_full_sync(inode); + + if (ret) + goto out_trans; + + trans->block_rsv = &fs_info->trans_block_rsv; + /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_args.drop_end <= end) + drop_args.drop_end = end + 1; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (!extent_info && cur_offset < ino_size && + cur_offset < drop_args.drop_end) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); + if (ret) { + /* Same comment as above. */ + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } else if (!extent_info && cur_offset < drop_args.drop_end) { + /* See the comment in the loop above for the reasoning here. */ + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, + drop_args.drop_end - cur_offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + + } + if (extent_info) { + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, extent_info->data_len, + drop_args.bytes_found); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &fs_info->trans_block_rsv; + if (ret) + btrfs_end_transaction(trans); + else + *trans_out = trans; +out_free: + btrfs_free_block_rsv(fs_info, rsv); +out: + return ret; +} + +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_trans_handle *trans = NULL; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + int ret = 0; + bool same_block; + u64 ino_size; + bool truncated_block = false; + bool updated_inode = false; + + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + + ret = btrfs_wait_ordered_range(inode, offset, len); + if (ret) + goto out_only_mutex; + + ino_size = round_up(inode->i_size, fs_info->sectorsize); + ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + ret = file_modified(file); + if (ret) + goto out_only_mutex; + + lockstart = round_up(offset, fs_info->sectorsize); + lockend = round_down(offset + len, fs_info->sectorsize) - 1; + same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) + == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); + /* + * We needn't truncate any block which is beyond the end of the file + * because we are sure there is no data there. + */ + /* + * Only do this if we are in the same block and we aren't doing the + * entire block. 
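+ *
+ * (For example, with a 4K sectorsize, punching 100 bytes at offset 10
+ * stays within the first block, so only btrfs_truncate_block() is used
+ * to zero that sub-block range and no extent items need to be dropped.)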
+ */ + if (same_block && len < fs_info->sectorsize) { + if (offset < ino_size) { + truncated_block = true; + ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, + 0); + } else { + ret = 0; + } + goto out_only_mutex; + } + + /* zero back part of the first block */ + if (offset < ino_size) { + truncated_block = true; + ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + if (ret) { + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + return ret; + } + } + + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including several following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + truncated_block = true; + ret = btrfs_truncate_block(BTRFS_I(inode), + tail_start + tail_len, + 0, 1); + if (ret) + goto out_only_mutex; + } + } + } + + if (lockend < lockstart) { + ret = 0; + goto out_only_mutex; + } + + btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, + lockend, NULL, &trans); + btrfs_free_path(path); + if (ret) + goto out; + + ASSERT(trans != NULL); + inode_inc_iversion(inode); + inode->i_mtime = inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + updated_inode = true; + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); +out_only_mutex: + if (!updated_inode && truncated_block && !ret) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. 
+ */ + struct timespec64 now = inode_set_ctime_current(inode); + + inode_inc_iversion(inode); + inode->i_mtime = now; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + int ret2; + + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_end_transaction(trans); + if (!ret) + ret = ret2; + } + } + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + return ret; +} + +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *range = NULL; + + if (!list_empty(head)) { + /* + * As fallocate iterates by bytenr order, we only need to check + * the last range. + */ + range = list_last_entry(head, struct falloc_range, list); + if (range->start + range->len == start) { + range->len += len; + return 0; + } + } + + range = kmalloc(sizeof(*range), GFP_KERNEL); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + +static int btrfs_fallocate_update_isize(struct inode *inode, + const u64 end, + const int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + int ret2; + + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) + return 0; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode_set_ctime_current(inode); + i_size_write(inode, end); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_end_transaction(trans); + + return ret ? ret : ret2; +} + +enum { + RANGE_BOUNDARY_WRITTEN_EXTENT, + RANGE_BOUNDARY_PREALLOC_EXTENT, + RANGE_BOUNDARY_HOLE, +}; + +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, + u64 offset) +{ + const u64 sectorsize = inode->root->fs_info->sectorsize; + struct extent_map *em; + int ret; + + offset = round_down(offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start == EXTENT_MAP_HOLE) + ret = RANGE_BOUNDARY_HOLE; + else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + ret = RANGE_BOUNDARY_PREALLOC_EXTENT; + else + ret = RANGE_BOUNDARY_WRITTEN_EXTENT; + + free_extent_map(em); + return ret; +} + +static int btrfs_zero_range(struct inode *inode, + loff_t offset, + loff_t len, + const int mode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_map *em; + struct extent_changeset *data_reserved = NULL; + int ret; + u64 alloc_hint = 0; + const u64 sectorsize = fs_info->sectorsize; + u64 alloc_start = round_down(offset, sectorsize); + u64 alloc_end = round_up(offset + len, sectorsize); + u64 bytes_to_reserve = 0; + bool space_reserved = false; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + alloc_end - alloc_start); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + /* + * Avoid hole punching and extent allocation for some cases. More cases + * could be considered, but these are unlikely common and we keep things + * as simple as possible for now. 
Also, intentionally, if the target + * range contains one or more prealloc extents together with regular + * extents and holes, we drop all the existing extents and allocate a + * new prealloc extent, so that we get a larger contiguous disk extent. + */ + if (em->start <= alloc_start && + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + const u64 em_end = em->start + em->len; + + if (em_end >= offset + len) { + /* + * The whole range is already a prealloc extent, + * do nothing except updating the inode's i_size if + * needed. + */ + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + /* + * Part of the range is already a prealloc extent, so operate + * only on the remaining part of the range. + */ + alloc_start = em_end; + ASSERT(IS_ALIGNED(alloc_start, sectorsize)); + len = offset + len - alloc_start; + offset = alloc_start; + alloc_hint = em->block_start + em->len; + } + free_extent_map(em); + + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + sectorsize); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + free_extent_map(em); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, + 0); + if (!ret) + ret = btrfs_fallocate_update_isize(inode, + offset + len, + mode); + return ret; + } + free_extent_map(em); + alloc_start = round_down(offset, sectorsize); + alloc_end = alloc_start + sectorsize; + goto reserve_space; + } + + alloc_start = round_up(offset, sectorsize); + alloc_end = round_down(offset + len, sectorsize); + + /* + * For unaligned ranges, check the pages at the boundaries, they might + * map to an extent, in which case we need to partially zero them, or + * they might map to a hole, in which case we need our allocation range + * to cover them. 
+ */ + if (!IS_ALIGNED(offset, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); + if (ret < 0) + goto out; + if (ret == RANGE_BOUNDARY_HOLE) { + alloc_start = round_down(offset, sectorsize); + ret = 0; + } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { + ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + if (ret) + goto out; + } else { + ret = 0; + } + } + + if (!IS_ALIGNED(offset + len, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset + len); + if (ret < 0) + goto out; + if (ret == RANGE_BOUNDARY_HOLE) { + alloc_end = round_up(offset + len, sectorsize); + ret = 0; + } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, + 0, 1); + if (ret) + goto out; + } else { + ret = 0; + } + } + +reserve_space: + if (alloc_start < alloc_end) { + struct extent_state *cached_state = NULL; + const u64 lockstart = alloc_start; + const u64 lockend = alloc_end - 1; + + bytes_to_reserve = alloc_end - alloc_start; + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + bytes_to_reserve); + if (ret < 0) + goto out; + space_reserved = true; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) { + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); + goto out; + } + ret = btrfs_prealloc_file_range(inode, mode, alloc_start, + alloc_end - alloc_start, + i_blocksize(inode), + offset + len, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); + /* btrfs_prealloc_file_range releases reserved space on error */ + if (ret) { + space_reserved = false; + goto out; + } + } + ret = btrfs_fallocate_update_isize(inode, offset + len, mode); + out: + if (ret && space_reserved) + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, + alloc_start, bytes_to_reserve); + extent_changeset_free(data_reserved); + + return ret; +} + +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + struct falloc_range *range; + struct falloc_range *tmp; + LIST_HEAD(reserve_list); + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 actual_end = 0; + u64 data_space_needed = 0; + u64 data_space_reserved = 0; + u64 qgroup_reserved = 0; + struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; + int ret; + + /* Do not allow fallocate in ZONED mode */ + if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + return -EOPNOTSUPP; + + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); + cur_offset = alloc_start; + + /* Make sure we aren't being give some crap mode */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return btrfs_punch_hole(file, offset, len); + + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out; + } + + ret = file_modified(file); + if (ret) + goto out; + + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still 
fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. + */ + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), + alloc_start); + if (ret) + goto out; + } else if (offset + len > inode->i_size) { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the block if i_size lands in the + * middle of a block. + */ + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + if (ret) + goto out; + } + + /* + * We have locked the inode at the VFS level (in exclusive mode) and we + * have locked the i_mmap_lock lock (in exclusive mode). Now before + * locking the file range, flush all dealloc in the range and wait for + * all ordered extents in the range to complete. After this we can lock + * the file range and, due to the previous locking we did, we know there + * can't be more delalloc or ordered extents in the range. + */ + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = btrfs_zero_range(inode, offset, len, mode); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + return ret; + } + + locked_end = alloc_end - 1; + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); + + btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); + + /* First, check if we exceed the qgroup limit */ + while (cur_offset < alloc_end) { + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + alloc_end - cur_offset); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + break; + } + last_byte = min(extent_map_end(em), alloc_end); + actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = ALIGN(last_byte, blocksize); + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + const u64 range_len = last_byte - cur_offset; + + ret = add_falloc_range(&reserve_list, cur_offset, range_len); + if (ret < 0) { + free_extent_map(em); + break; + } + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), + &data_reserved, cur_offset, range_len); + if (ret < 0) { + free_extent_map(em); + break; + } + qgroup_reserved += range_len; + data_space_needed += range_len; + } + free_extent_map(em); + cur_offset = last_byte; + } + + if (!ret && data_space_needed > 0) { + /* + * We are safe to reserve space here as we can't have delalloc + * in the range, see above. + */ + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + data_space_needed); + if (!ret) + data_space_reserved = data_space_needed; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) { + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, i_blocksize(inode), + offset + len, &alloc_hint); + /* + * btrfs_prealloc_file_range() releases space even + * if it returns an error. 
+ */ + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (data_space_reserved > 0) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, range->start, + range->len); + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (qgroup_reserved > 0) { + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, + range->start, range->len, NULL); + qgroup_reserved -= range->len; + } + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + /* + * We didn't need to allocate any more space, but we still extended the + * size of the file so we need to update i_size and the inode item. + */ + ret = btrfs_fallocate_update_isize(inode, actual_end, mode); +out_unlock: + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); +out: + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + extent_changeset_free(data_reserved); + return ret; +} + +/* + * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range + * that has unflushed and/or flushing delalloc. There might be other adjacent + * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps + * looping while it gets adjacent subranges, and merging them together. + */ +static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + bool *search_io_tree, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + u64 len = end + 1 - start; + u64 delalloc_len = 0; + struct btrfs_ordered_extent *oe; + u64 oe_start; + u64 oe_end; + + /* + * Search the io tree first for EXTENT_DELALLOC. If we find any, it + * means we have delalloc (dirty pages) for which writeback has not + * started yet. + */ + if (*search_io_tree) { + spin_lock(&inode->lock); + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); + } else { + spin_unlock(&inode->lock); + } + } + + if (delalloc_len > 0) { + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ + *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + + if (*delalloc_start_ret == start) { + /* Delalloc for the whole range, nothing more to do. */ + if (*delalloc_end_ret == end) + return true; + /* Else trim our search range for ordered extents. */ + start = *delalloc_end_ret + 1; + len = end + 1 - start; + } + } else { + /* No delalloc, future calls don't need to search again. */ + *search_io_tree = false; + } + + /* + * Now also check if there's any ordered extent in the range. + * We do this because: + * + * 1) When delalloc is flushed, the file range is locked, we clear the + * EXTENT_DELALLOC bit from the io tree and create an extent map and + * an ordered extent for the write. So we might just have been called + * after delalloc is flushed and before the ordered extent completes + * and inserts the new file extent item in the subvolume's btree; + * + * 2) We may have an ordered extent created by flushing delalloc for a + * subrange that starts before the subrange we found marked with + * EXTENT_DELALLOC in the io tree. + * + * We could also use the extent map tree to find such delalloc that is + * being flushed, but using the ordered extents tree is more efficient + * because it's usually much smaller as ordered extents are removed from + * the tree once they complete. 
With the extent maps, we may have them
+ * in the extent map tree for a very long time, and they were either
+ * created by previous writes or loaded by read operations.
+ */
+ oe = btrfs_lookup_first_ordered_range(inode, start, len);
+ if (!oe)
+ return (delalloc_len > 0);
+
+ /* The ordered extent may span beyond our search range. */
+ oe_start = max(oe->file_offset, start);
+ oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
+
+ btrfs_put_ordered_extent(oe);
+
+ /* Don't have unflushed delalloc, return the ordered extent range. */
+ if (delalloc_len == 0) {
+ *delalloc_start_ret = oe_start;
+ *delalloc_end_ret = oe_end;
+ return true;
+ }
+
+ /*
+ * We have both unflushed delalloc (io_tree) and an ordered extent.
+ * If the ranges are adjacent, return a combined range, otherwise
+ * return the leftmost range.
+ */
+ if (oe_start < *delalloc_start_ret) {
+ if (oe_end < *delalloc_start_ret)
+ *delalloc_end_ret = oe_end;
+ *delalloc_start_ret = oe_start;
+ } else if (*delalloc_end_ret + 1 == oe_start) {
+ *delalloc_end_ret = oe_end;
+ }
+
+ return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode: The inode.
+ * @start: The start offset of the range. It does not need to be
+ * sector size aligned.
+ * @end: The end offset (inclusive value) of the search range.
+ * It does not need to be sector size aligned.
+ * @cached_state: Extent state record used for speeding up delalloc
+ * searches in the inode's io_tree. Can be NULL.
+ * @delalloc_start_ret: Output argument, set to the start offset of the
+ * subrange found with delalloc (may not be sector size
+ * aligned).
+ * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
+ * of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+ u64 prev_delalloc_end = 0;
+ bool search_io_tree = true;
+ bool ret = false;
+
+ while (cur_offset <= end) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ cached_state, &search_io_tree,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ if (prev_delalloc_end == 0) {
+ /* First subrange found. */
+ *delalloc_start_ret = max(delalloc_start, start);
+ *delalloc_end_ret = delalloc_end;
+ ret = true;
+ } else if (delalloc_start == prev_delalloc_end + 1) {
+ /* Subrange adjacent to the previous one, merge them. */
+ *delalloc_end_ret = delalloc_end;
+ } else {
+ /* Subrange not adjacent to the previous one, exit. */
+ break;
+ }
+
+ prev_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Check if there's a hole or delalloc range in a range representing a hole (or
+ * prealloc extent) found in the inode's subvolume btree.
+ *
+ * @inode: The inode.
+ * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start: Start offset of the hole region. It does not need to be sector
+ * size aligned.
+ * @end: End offset (inclusive value) of the hole region. It does not
+ * need to be sector size aligned.
+ * @start_ret: Return parameter, used to set the start of the subrange in the
+ * hole that matches the search criteria (seek mode), if such
+ * subrange is found (return value of the function is true).
+ * The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ struct extent_state **cached_state,
+ u64 start, u64 end, u64 *start_ret)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
+ &delalloc_start, &delalloc_end);
+ if (delalloc && whence == SEEK_DATA) {
+ *start_ret = delalloc_start;
+ return true;
+ }
+
+ if (delalloc && whence == SEEK_HOLE) {
+ /*
+ * We found delalloc but it starts after our start offset. So we
+ * have a hole between our start offset and the delalloc start.
+ */
+ if (start < delalloc_start) {
+ *start_ret = start;
+ return true;
+ }
+ /*
+ * Delalloc range starts at our start offset.
+ * If the delalloc range's length is smaller than our range,
+ * then it means we have a hole that starts where the delalloc
+ * subrange ends.
+ */
+ if (delalloc_end < end) {
+ *start_ret = delalloc_end + 1;
+ return true;
+ }
+
+ /* There's delalloc for the whole range. */
+ return false;
+ }
+
+ if (!delalloc && whence == SEEK_HOLE) {
+ *start_ret = start;
+ return true;
+ }
+
+ /*
+ * No delalloc in the range and we are seeking for data. The caller has
+ * to iterate to the next extent item in the subvolume btree.
+ */
+ return false;
+}
+
+static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
+{
+ struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
+ struct btrfs_file_private *private = file->private_data;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_state *cached_state = NULL;
+ struct extent_state **delalloc_cached_state;
+ const loff_t i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 last_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ int ret;
+ bool found = false;
+
+ if (i_size == 0 || offset >= i_size)
+ return -ENXIO;
+
+ /*
+ * Quick path. If the inode has no prealloc extents and its number of
+ * bytes used matches its i_size, then it can not have holes.
+ */
+ if (whence == SEEK_HOLE &&
+ !(inode->flags & BTRFS_INODE_PREALLOC) &&
+ inode_get_bytes(&inode->vfs_inode) == i_size)
+ return i_size;
+
+ if (!private) {
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ /*
+ * No worries if memory allocation failed.
+ * The private structure is used only for speeding up multiple
+ * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
+ * so everything will still be correct.
+ */
+ file->private_data = private;
+ }
+
+ if (private)
+ delalloc_cached_state = &private->llseek_cached_state;
+ else
+ delalloc_cached_state = NULL;
+
+ /*
+ * offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */ + start = max_t(loff_t, 0, offset); + + lockstart = round_down(start, fs_info->sectorsize); + lockend = round_up(i_size, fs_info->sectorsize); + if (lockend <= lockstart) + lockend = lockstart + fs_info->sectorsize; + lockend--; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + last_extent_end = lockstart; + + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (start < i_size) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *extent; + u64 extent_end; + u8 type; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); + + /* + * In the first iteration we may have a slot that points to an + * extent that ends before our start offset, so skip it. + */ + if (extent_end <= start) { + path->slots[0]++; + continue; + } + + /* We have an implicit hole, NO_HOLES feature is likely set. */ + if (last_extent_end < key.offset) { + u64 search_start = last_extent_end; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, + search_start, + key.offset - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the extent. + */ + } + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, extent); + + /* + * Can't access the extent's disk_bytenr field if this is an + * inline extent, since at that offset, it's where the extent + * data starts. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC || + (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { + /* + * Explicit hole or prealloc extent, search for delalloc. + * A prealloc extent is treated like a hole. + */ + u64 search_start = key.offset; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, + search_start, + extent_end - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the next + * extent item. + */ + } else { + /* + * Found a regular or inline extent. + * If we are seeking for data, adjust the start offset + * and stop, we're done. + */ + if (whence == SEEK_DATA) { + start = max_t(u64, key.offset, offset); + found = true; + break; + } + /* + * Else, we are seeking for a hole, check the next file + * extent item. 
+ */ + } + + start = extent_end; + last_extent_end = extent_end; + path->slots[0]++; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + } + + /* We have an implicit hole from the last extent found up to i_size. */ + if (!found && start < i_size) { + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, start, + i_size - 1, &start); + if (!found) + start = i_size; + } + +out: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_free_path(path); + + if (ret < 0) + return ret; + + if (whence == SEEK_DATA && start >= i_size) + return -ENXIO; + + return min_t(loff_t, start, i_size); +} + +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + + switch (whence) { + default: + return generic_file_llseek(file, offset, whence); + case SEEK_DATA: + case SEEK_HOLE: + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + offset = find_desired_extent(file, offset, whence); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + break; + } + + if (offset < 0) + return offset; + + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +} + +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + int ret; + + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_CAN_ODIRECT; + + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; + return generic_file_open(inode, filp); +} + +static int check_direct_read(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + int ret; + int i, seg; + + ret = check_direct_IO(fs_info, iter, offset); + if (ret < 0) + return ret; + + if (!iter_is_iovec(iter)) + return 0; + + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + const struct iovec *iov1 = iter_iov(iter) + seg; + const struct iovec *iov2 = iter_iov(iter) + i; + + if (iov1->iov_base == iov2->iov_base) + return -EINVAL; + } + } + return 0; +} + +static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + size_t prev_left = 0; + ssize_t read = 0; + ssize_t ret; + + if (fsverity_active(inode)) + return 0; + + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + return 0; + + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); +again: + /* + * This is similar to what we do for direct IO writes, see the comment + * at btrfs_direct_write(), but we also disable page faults in addition + * to disabling them only at the iov_iter level. This is because when + * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), + * which can still trigger page fault ins despite having set ->nofault + * to true of our 'to' iov_iter. + * + * The difference to direct IO writes is that we deadlock when trying + * to lock the extent range in the inode's tree during he page reads + * triggered by the fault in (while for writes it is due to waiting for + * our own ordered extent). This is because for direct IO reads, + * btrfs_dio_iomap_begin() returns with the extent range locked, which + * is only unlocked in the endio callback (end_bio_extent_readpage()). + */ + pagefault_disable(); + to->nofault = true; + ret = btrfs_dio_read(iocb, to, read); + to->nofault = false; + pagefault_enable(); + + /* No increment (+=) because iomap returns a cumulative value. 
*/ + if (ret > 0) + read = ret; + + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(to); + + if (left == prev_left) { + /* + * We didn't make any progress since the last attempt, + * fallback to a buffered read for the remainder of the + * range. This is just to avoid any possibility of looping + * for too long. + */ + ret = read; + } else { + /* + * We made some progress since the last retry or this is + * the first time we are retrying. Fault in as many pages + * as possible and retry. + */ + fault_in_iov_iter_writeable(to, left); + prev_left = left; + goto again; + } + } + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + return ret < 0 ? ret : read; +} + +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = btrfs_direct_read(iocb, to); + if (ret < 0 || !iov_iter_count(to) || + iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) + return ret; + } + + return filemap_read(iocb, to, ret); +} + +const struct file_operations btrfs_file_operations = { + .llseek = btrfs_file_llseek, + .read_iter = btrfs_file_read_iter, + .splice_read = filemap_splice_read, + .write_iter = btrfs_file_write_iter, + .splice_write = iter_file_splice_write, + .mmap = btrfs_file_mmap, + .open = btrfs_file_open, + .release = btrfs_release_file, + .get_unmapped_area = thp_get_unmapped_area, + .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_compat_ioctl, +#endif + .remap_file_range = btrfs_remap_file_range, +}; + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. 
+ */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +} diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h new file mode 100644 index 0000000000..82b34fbb29 --- /dev/null +++ b/fs/btrfs/file.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_H +#define BTRFS_FILE_H + +extern const struct file_operations btrfs_file_operations; + +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); + +#endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 0000000000..8dd8ef7603 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,4334 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Red Hat. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "misc.h" +#include "free-space-cache.h" +#include "transaction.h" +#include "disk-io.h" +#include "extent_io.h" +#include "volumes.h" +#include "space-info.h" +#include "delalloc-space.h" +#include "block-group.h" +#include "discard.h" +#include "subpage.h" +#include "inode-item.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" + +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) +#define MAX_CACHE_BYTES_PER_GIG SZ_64K +#define FORCE_EXTENT_THRESHOLD SZ_1M + +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + +struct btrfs_trim_range { + u64 start; + u64 bytes; + struct list_head list; +}; + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, bool update_stat); +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes, bool for_alloc); +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes, bool update_stats); + +static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info, true); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + + cond_resched_lock(&ctl->tree_lock); + } +} + +static struct inode *__lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_path *path, + u64 offset) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + unsigned nofs_flag; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(path); + + /* + * We are often under a trans handle at this point, so we need to make + * sure NOFS is set to keep us from deadlocking. 
+ */ + nofs_flag = memalloc_nofs_save(); + inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); + btrfs_release_path(path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(inode)) + return inode; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_constraint(inode->i_mapping, + ~(__GFP_FS | __GFP_HIGHMEM))); + + return inode; +} + +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(fs_info->tree_root, path, + block_group->start); + if (IS_ERR(inode)) + return inode; + + spin_lock(&block_group->lock); + if (!((BTRFS_I(inode)->flags & flags) == flags)) { + btrfs_info(fs_info, "Old style space inode found, converting."); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + + if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) + block_group->inode = igrab(inode); + spin_unlock(&block_group->lock); + + return inode; +} + +static int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 ino, u64 offset) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + /* We inline CRCs for the free disk space cache */ + const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | + BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + int ret; + + ret = btrfs_insert_empty_inode(trans, root, path, ino); + if (ret) + return ret; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memzero_extent_buffer(leaf, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, flags); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + btrfs_set_inode_block_group(leaf, inode_item, offset); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + return 0; +} + +int create_free_space_inode(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + int ret; + u64 ino; + + ret = btrfs_get_free_objectid(trans->fs_info->tree_root, 
&ino); + if (ret < 0) + return ret; + + return __create_free_space_inode(trans->fs_info->tree_root, trans, path, + ino, block_group->start); +} + +/* + * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode + * handles lookup, otherwise it takes ownership and iputs the inode. + * Don't reuse an inode pointer after passing it into this function. + */ +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!inode) + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) != -ENOENT) + ret = PTR_ERR(inode); + goto out; + } + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(BTRFS_I(inode)); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for the lookup ref */ + btrfs_add_delayed_iput(BTRFS_I(inode)); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.type = 0; + key.offset = block_group->start; + ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, + -1, 1); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct inode *vfs_inode) +{ + struct btrfs_truncate_control control = { + .inode = BTRFS_I(vfs_inode), + .new_size = 0, + .ino = btrfs_ino(BTRFS_I(vfs_inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; + int ret = 0; + bool locked = false; + + if (block_group) { + struct btrfs_path *path = btrfs_alloc_path(); + + if (!path) { + ret = -ENOMEM; + goto fail; + } + locked = true; + mutex_lock(&trans->transaction->cache_write_mutex); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + btrfs_wait_cache_io(trans, block_group, path); + btrfs_put_block_group(block_group); + } + + /* + * now that we've truncated the cache away, its no longer + * setup or written + */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + btrfs_free_path(path); + } + + btrfs_i_size_write(inode, 0); + truncate_pagecache(vfs_inode, 0); + + lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); + + /* + * We skip the throttling logic for free space cache inodes, so we don't + * need to check for -EAGAIN. 
+ */ + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); + + unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + if (ret) + goto fail; + + ret = btrfs_update_inode(trans, root, inode); + +fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + +static void readahead_cache(struct inode *inode) +{ + struct file_ra_state ra; + unsigned long last_index; + + file_ra_state_init(&ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); +} + +static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, + int write) +{ + int num_pages; + + num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + /* Make sure we can fit our crcs and generation into the first page */ + if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) + return -ENOSPC; + + memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); + + io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + + io_ctl->num_pages = num_pages; + io_ctl->fs_info = btrfs_sb(inode->i_sb); + io_ctl->inode = inode; + + return 0; +} +ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); + +static void io_ctl_free(struct btrfs_io_ctl *io_ctl) +{ + kfree(io_ctl->pages); + io_ctl->pages = NULL; +} + +static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) +{ + if (io_ctl->cur) { + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) +{ + ASSERT(io_ctl->index < io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = page_address(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_SIZE; + if (clear) + clear_page(io_ctl->cur); +} + +static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + if (io_ctl->pages[i]) { + btrfs_page_clear_checked(io_ctl->fs_info, + io_ctl->pages[i], + page_offset(io_ctl->pages[i]), + PAGE_SIZE); + unlock_page(io_ctl->pages[i]); + put_page(io_ctl->pages[i]); + } + } +} + +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) +{ + struct page *page; + struct inode *inode = io_ctl->inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + int ret; + + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + io_ctl_drop_pages(io_ctl); + return ret; + } + + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != inode->i_mapping) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "free space cache page truncated"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + if (!PageUptodate(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + for (i = 0; i < io_ctl->num_pages; i++) + clear_page_dirty_for_io(io_ctl->pages[i]); + + return 0; +} + +static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) +{ + 
io_ctl_map_page(io_ctl, 1); + + /* + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. + */ + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + + put_unaligned_le64(generation, io_ctl->cur); + io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) +{ + u64 cache_gen; + + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. + */ + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { + btrfs_err_rl(io_ctl->fs_info, + "space cache generation (%llu) does not match inode (%llu)", + cache_gen, generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + return 0; +} + +static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = page_address(io_ctl->pages[0]); + tmp += index; + *tmp = crc; +} + +static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = page_address(io_ctl->pages[0]); + tmp += index; + val = *tmp; + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); + if (val != crc) { + btrfs_err_rl(io_ctl->fs_info, + "csum mismatch on free space cache"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + + return 0; +} + +static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = io_ctl->cur; + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + copy_page(io_ctl->cur, bitmap); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) +{ + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. 
+ */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + } +} + +static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) +{ + struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } + + e = io_ctl->cur; + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); + *type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + int ret; + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + + copy_page(entry->bitmap, io_ctl->cur); + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_block_group *block_group = ctl->block_group; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->length; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + + max_bitmaps = max_t(u64, max_bitmaps, 1); + + if (ctl->total_bitmaps > max_bitmaps) + btrfs_err(block_group->fs_info, +"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", + block_group->start, block_group->length, + ctl->total_bitmaps, ctl->unit, max_bitmaps, + bytes_per_bg); + ASSERT(ctl->total_bitmaps <= max_bitmaps); + + /* + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. + */ + if (size < SZ_1G) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); + + bitmap_bytes = ctl->total_bitmaps * ctl->unit; + + /* + * we want the extent entry threshold to always be at most 1/2 the max + * bytes we can have, or whatever is less than that. 
+ */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); + + ctl->extents_thresh = + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); +} + +static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct btrfs_io_ctl io_ctl; + struct btrfs_key key; + struct btrfs_free_space *e, *n; + LIST_HEAD(bitmaps); + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u8 type; + int ret = 0; + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) + return 0; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return 0; + else if (ret > 0) { + btrfs_release_path(path); + return 0; + } + + ret = -1; + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_release_path(path); + + if (!BTRFS_I(inode)->generation) { + btrfs_info(fs_info, + "the free space cache file (%llu) is invalid, skip it", + offset); + return 0; + } + + if (BTRFS_I(inode)->generation != generation) { + btrfs_err(fs_info, + "free space inode generation (%llu) did not match free space cache generation (%llu)", + BTRFS_I(inode)->generation, generation); + return 0; + } + + if (!num_entries) + return 0; + + ret = io_ctl_init(&io_ctl, inode, 0); + if (ret) + return ret; + + readahead_cache(inode); + + ret = io_ctl_prepare_pages(&io_ctl, true); + if (ret) + goto out; + + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; + + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) { + ret = -ENOMEM; + goto free_cache; + } + + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + + if (!e->bytes) { + ret = -1; + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + btrfs_err(fs_info, + "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + } else { + ASSERT(num_bitmaps); + num_bitmaps--; + e->bitmap = kmem_cache_zalloc( + btrfs_free_space_bitmap_cachep, GFP_NOFS); + if (!e->bitmap) { + ret = -ENOMEM; + kmem_cache_free( + btrfs_free_space_cachep, e); + goto free_cache; + } + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + if (ret) { + spin_unlock(&ctl->tree_lock); + btrfs_err(fs_info, + "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + ctl->total_bitmaps++; + recalculate_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + list_add_tail(&e->list, &bitmaps); + } + + num_entries--; + } + + io_ctl_unmap_page(&io_ctl); + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. 
+ */ + list_for_each_entry_safe(e, n, &bitmaps, list) { + list_del_init(&e->list); + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; + } + + io_ctl_drop_pages(&io_ctl); + ret = 1; +out: + io_ctl_free(&io_ctl); + return ret; +free_cache: + io_ctl_drop_pages(&io_ctl); + + spin_lock(&ctl->tree_lock); + __btrfs_remove_free_space_cache(ctl); + spin_unlock(&ctl->tree_lock); + goto out; +} + +static int copy_free_space_cache(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int ret = 0; + + while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + const u64 offset = info->offset; + const u64 bytes = info->bytes; + + unlink_free_space(ctl, info, true); + spin_unlock(&ctl->tree_lock); + kmem_cache_free(btrfs_free_space_cachep, info); + ret = btrfs_add_free_space(block_group, offset, bytes); + spin_lock(&ctl->tree_lock); + } else { + u64 offset = info->offset; + u64 bytes = ctl->unit; + + ret = search_bitmap(ctl, info, &offset, &bytes, false); + if (ret == 0) { + bitmap_clear_bits(ctl, info, offset, bytes, true); + spin_unlock(&ctl->tree_lock); + ret = btrfs_add_free_space(block_group, offset, + bytes); + spin_lock(&ctl->tree_lock); + } else { + free_bitmap(ctl, info); + ret = 0; + } + } + cond_resched_lock(&ctl->tree_lock); + } + return ret; +} + +static struct lock_class_key btrfs_free_space_inode_key; + +int load_free_space_cache(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space_ctl tmp_ctl = {}; + struct inode *inode; + struct btrfs_path *path; + int ret = 0; + bool matched; + u64 used = block_group->used; + + /* + * Because we could potentially discard our loaded free space, we want + * to load everything into a temporary structure first, and then if it's + * valid copy it all into the actual free space ctl. + */ + btrfs_init_free_space_ctl(block_group, &tmp_ctl); + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + path = btrfs_alloc_path(); + if (!path) + return 0; + path->search_commit_root = 1; + path->skip_locking = 1; + + /* + * We must pass a path with search_commit_root set to btrfs_iget in + * order to avoid a deadlock when allocating extents for the tree root. + * + * When we are COWing an extent buffer from the tree root, when looking + * for a free extent, at extent-tree.c:find_free_extent(), we can find + * block group without its free space cache loaded. When we find one + * we must load its space cache which requires reading its free space + * cache's inode item from the root tree. If this inode item is located + * in the same leaf that we started COWing before, then we end up in + * deadlock on the extent buffer (trying to read lock it when we + * previously write locked it). + * + * It's safe to read the inode item using the commit root because + * block groups, once loaded, stay in memory forever (until they are + * removed) as well as their space caches once loaded. 
New block groups + * once created get their ->cached field set to BTRFS_CACHE_FINISHED so + * we will never try to read their inode item while the fs is mounted. + */ + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + btrfs_free_path(path); + goto out; + } + spin_unlock(&block_group->lock); + + /* + * Reinitialize the class of struct inode's mapping->invalidate_lock for + * free space inodes to prevent false positives related to locks for normal + * inodes. + */ + lockdep_set_class(&(&inode->i_data)->invalidate_lock, + &btrfs_free_space_inode_key); + + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, + path, block_group->start); + btrfs_free_path(path); + if (ret <= 0) + goto out; + + matched = (tmp_ctl.free_space == (block_group->length - used - + block_group->bytes_super)); + + if (matched) { + spin_lock(&tmp_ctl.tree_lock); + ret = copy_free_space_cache(block_group, &tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); + /* + * ret == 1 means we successfully loaded the free space cache, + * so we need to re-set it here. + */ + if (ret == 0) + ret = 1; + } else { + /* + * We need to call the _locked variant so we don't try to update + * the discard counters. + */ + spin_lock(&tmp_ctl.tree_lock); + __btrfs_remove_free_space_cache(&tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); + btrfs_warn(fs_info, + "block group %llu has wrong amount of free space", + block_group->start); + ret = -1; + } +out: + if (ret < 0) { + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + ret = 0; + + btrfs_warn(fs_info, + "failed to load free space cache for block group %llu, rebuilding it now", + block_group->start); + } + + spin_lock(&ctl->tree_lock); + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); + iput(inode); + return ret; +} + +static noinline_for_stack +int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) +{ + int ret; + struct btrfs_free_cluster *cluster = NULL; + struct btrfs_free_cluster *cluster_locked = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); + struct btrfs_trim_range *trim_entry; + + /* Get the cluster for this block_group if it exists */ + if (block_group && !list_empty(&block_group->cluster_list)) { + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + } + + if (!node && cluster) { + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); + node = rb_first(&cluster->root); + cluster = NULL; + } + + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + *entries += 1; + + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto fail; + + if (e->bitmap) { + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; + } + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); + cluster = NULL; + } + } + if (cluster_locked) { + 
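+ /* Done iterating the cluster's rbtree, drop the lock taken above. */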
spin_unlock(&cluster_locked->lock); + cluster_locked = NULL; + } + + /* + * Make sure we don't miss any range that was removed from our rbtree + * because trimming is running. Otherwise after a umount+mount (or crash + * after committing the transaction) we would leak free space and get + * an inconsistent free space cache report from fsck. + */ + list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { + ret = io_ctl_add_entry(io_ctl, trim_entry->start, + trim_entry->bytes, NULL); + if (ret) + goto fail; + *entries += 1; + } + + return 0; +fail: + if (cluster_locked) + spin_unlock(&cluster_locked->lock); + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DELALLOC, + NULL); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int write_pinned_extent_entries( + struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_io_ctl *io_ctl, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct extent_io_tree *unpin = NULL; + int ret; + + if (!block_group) + return 0; + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + * + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = &trans->transaction->pinned_extents; + + start = block_group->start; + + while (start < block_group->start + block_group->length) { + if (!find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) + return 0; + + /* This pinned extent is out of our range */ + if (extent_start >= block_group->start + block_group->length) + return 0; + + extent_start = max(extent_start, start); + extent_end = min(block_group->start + block_group->length, + extent_end + 1); + len = extent_end - extent_start; + + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); + if (ret) + return -ENOSPC; + + start = extent_end; + } + + return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) +{ + struct btrfs_free_space *entry, *next; + int ret; + + /* Write out the bitmaps */ + list_for_each_entry_safe(entry, next, bitmap_list, list) { + ret = io_ctl_add_bitmap(io_ctl, 
entry->bitmap); + if (ret) + return -ENOSPC; + list_del_init(&entry->list); + } + + return 0; +} + +static int flush_dirty_cache(struct inode *inode) +{ + int ret; + + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); + + return ret; +} + +static void noinline_for_stack +cleanup_bitmap_list(struct list_head *bitmap_list) +{ + struct btrfs_free_space *entry, *next; + + list_for_each_entry_safe(entry, next, bitmap_list, list) + list_del_init(&entry->list); +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct btrfs_io_ctl *io_ctl, + struct extent_state **cached_state) +{ + io_ctl_drop_pages(io_ctl); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); +} + +static int __btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) +{ + int ret; + struct inode *inode = io_ctl->inode; + + if (!inode) + return 0; + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + io_ctl->entries, io_ctl->bitmaps); +out: + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + if (block_group) + btrfs_debug(root->fs_info, + "failed to write free space cache for block group %llu error %d", + block_group->start, ret); + } + btrfs_update_inode(trans, root, BTRFS_I(inode)); + + if (block_group) { + /* the dirty list is protected by the dirty_bgs_lock */ + spin_lock(&trans->transaction->dirty_bgs_lock); + + /* the disk_cache_state is protected by the block group lock */ + spin_lock(&block_group->lock); + + /* + * only mark this as written if we didn't get put back on + * the dirty list while waiting for IO. Otherwise our + * cache state won't be right, and we won't get written again + */ + if (!ret && list_empty(&block_group->dirty_list)) + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + else if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + + spin_unlock(&block_group->lock); + spin_unlock(&trans->transaction->dirty_bgs_lock); + io_ctl->inode = NULL; + iput(inode); + } + + return ret; + +} + +int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, + block_group, &block_group->io_ctl, + path, block_group->start); +} + +/* + * Write out cached info to an inode. + * + * @root: root the inode belongs to + * @inode: freespace inode we are writing out + * @ctl: free space cache we are going to write out + * @block_group: block_group for this cache if it belongs to a block_group + * @io_ctl: holds context for the io + * @trans: the trans handle + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successful in writing the cache out, + * or an errno if it was not. 
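+ *
+ * The pages are only dirtied and submitted for writeback here; the caller
+ * is expected to wait for the IO (see btrfs_wait_cache_io()) and update
+ * the free space header item afterwards.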
+ */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_trans_handle *trans) +{ + struct extent_state *cached_state = NULL; + LIST_HEAD(bitmap_list); + int entries = 0; + int bitmaps = 0; + int ret; + int must_iput = 0; + + if (!i_size_read(inode)) + return -EIO; + + WARN_ON(io_ctl->pages); + ret = io_ctl_init(io_ctl, inode, 1); + if (ret) + return ret; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + down_write(&block_group->data_rwsem); + spin_lock(&block_group->lock); + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + up_write(&block_group->data_rwsem); + BTRFS_I(inode)->generation = 0; + ret = 0; + must_iput = 1; + goto out; + } + spin_unlock(&block_group->lock); + } + + /* Lock all pages first so we can lock the extent safely. */ + ret = io_ctl_prepare_pages(io_ctl, false); + if (ret) + goto out_unlock; + + lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); + + io_ctl_set_generation(io_ctl, trans->transid); + + mutex_lock(&ctl->cache_writeout_mutex); + /* Write out the extent entries in the free space cache */ + spin_lock(&ctl->tree_lock); + ret = write_cache_extent_entries(io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc_locked; + + /* + * Some spaces that are freed in the current transaction are pinned, + * they will be added into free space cache after the transaction is + * committed, we shouldn't lose them. + * + * If this changes while we are working we'll get added back to + * the dirty list and redo it. No locking needed + */ + ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); + if (ret) + goto out_nospc_locked; + + /* + * At last, we write out all the bitmaps and keep cache_writeout_mutex + * locked while doing it because a concurrent trim can be manipulating + * or freeing the bitmap. + */ + ret = write_bitmap_entries(io_ctl, &bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + if (ret) + goto out_nospc; + + /* Zero out the rest of the pages just to make sure */ + io_ctl_zero_remaining_pages(io_ctl); + + /* Everything is written out, now we dirty the pages in the file. 
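+ * Dirtying marks the range delalloc, so the writeback kicked off by
+ * btrfs_fdatawrite_range() below will send the cache pages to disk.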
*/ + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, + io_ctl->num_pages, 0, i_size_read(inode), + &cached_state, false); + if (ret) + goto out_nospc; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + /* + * Release the pages and unlock the extent, we will flush + * them out later + */ + io_ctl_drop_pages(io_ctl); + io_ctl_free(io_ctl); + + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); + + /* + * at this point the pages are under IO and we're happy, + * The caller is responsible for waiting on them and updating + * the cache and the inode + */ + io_ctl->entries = entries; + io_ctl->bitmaps = bitmaps; + + ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); + if (ret) + goto out; + + return 0; + +out_nospc_locked: + cleanup_bitmap_list(&bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + +out_nospc: + cleanup_write_cache_enospc(inode, io_ctl, &cached_state); + +out_unlock: + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + +out: + io_ctl->inode = NULL; + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (must_iput) + iput(inode); + return ret; +} + +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct inode *inode; + int ret = 0; + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, + block_group, &block_group->io_ctl, trans); + if (ret) { + btrfs_debug(fs_info, + "failed to write free space cache for block group %llu error %d", + block_group->start, ret); + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + + block_group->io_ctl.inode = NULL; + iput(inode); + } + + /* + * if ret == 0 the caller is expected to call btrfs_wait_cache_io + * to wait for IO and put the inode + */ + + return ret; +} + +static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, + u64 offset) +{ + ASSERT(offset >= bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div_u64(offset, unit)); +} + +static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) +{ + return (unsigned long)(div_u64(bytes, unit)); +} + +static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; + + bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; + bitmap_start = offset - ctl->start; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += ctl->start; + + return bitmap_start; +} + +static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *new_entry) +{ + struct rb_root *root; + struct rb_node **p; + struct rb_node *parent = NULL; + + lockdep_assert_held(&ctl->tree_lock); + + if (cluster) { + lockdep_assert_held(&cluster->lock); + root = &cluster->root; + } else { + 
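+ /* No cluster was given, insert into the block group's offset-sorted tree. */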
root = &ctl->free_space_offset; + } + + p = &root->rb_node; + + while (*p) { + struct btrfs_free_space *info; + + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (new_entry->offset < info->offset) { + p = &(*p)->rb_left; + } else if (new_entry->offset > info->offset) { + p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (new_entry->bitmap) { + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_right; + } else { + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_left; + } + } + } + + rb_link_node(&new_entry->offset_index, parent, p); + rb_insert_color(&new_entry->offset_index, root); + + return 0; +} + +/* + * This is a little subtle. We *only* have ->max_extent_size set if we actually + * searched through the bitmap and figured out the largest ->max_extent_size, + * otherwise it's 0. In the case that it's 0 we don't want to tell the + * allocator the wrong thing, we want to use the actual real max_extent_size + * we've found already if it's larger, or we want to use ->bytes. + * + * This matters because find_free_space() will skip entries who's ->bytes is + * less than the required bytes. So if we didn't search down this bitmap, we + * may pick some previous entry that has a smaller ->max_extent_size than we + * have. For example, assume we have two entries, one that has + * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set + * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will + * call into find_free_space(), and return with max_extent_size == 4K, because + * that first bitmap entry had ->max_extent_size set, but the second one did + * not. If instead we returned 8K we'd come in searching for 8K, and find the + * 8K contiguous range. + * + * Consider the other case, we have 2 8K chunks in that second entry and still + * don't have ->max_extent_size set. We'll return 16K, and the next time the + * allocator comes in it'll fully search our second bitmap, and this time it'll + * get an uptodate value of 8K as the maximum chunk size. Then we'll get the + * right allocation the next loop through. + */ +static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * We want the largest entry to be leftmost, so this is inverted from what you'd + * normally expect. + */ +static bool entry_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct btrfs_free_space *entry, *exist; + + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + exist = rb_entry(parent, struct btrfs_free_space, bytes_index); + return get_max_extent_size(exist) < get_max_extent_size(entry); +} + +/* + * searches the tree for the given offset. 
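+ *
+ * bitmap_only - If set, only return a bitmap entry that starts at this
+ * offset; an extent entry sharing the same offset is skipped.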
+ * + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. + */ +static struct btrfs_free_space * +tree_search_offset(struct btrfs_free_space_ctl *ctl, + u64 offset, int bitmap_only, int fuzzy) +{ + struct rb_node *n = ctl->free_space_offset.rb_node; + struct btrfs_free_space *entry = NULL, *prev = NULL; + + lockdep_assert_held(&ctl->tree_lock); + + /* find entry that is closest to the 'offset' */ + while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + + if (offset < entry->offset) + n = n->rb_left; + else if (offset > entry->offset) + n = n->rb_right; + else + break; + + entry = NULL; + } + + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; + + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; + + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { + /* + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry + */ + n = rb_prev(&entry->offset_index); + if (n) { + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + entry = prev; + } + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + ASSERT(entry->offset <= offset); + } else { + if (fuzzy) + return entry; + else + return NULL; + } + } + + if (entry->bitmap) { + n = rb_prev(&entry->offset_index); + if (n) { + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + return prev; + } + if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + ctl->unit > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + } + return entry; +} + +static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + lockdep_assert_held(&ctl->tree_lock); + + rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + ctl->free_extents--; + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; + } + + if (update_stat) + ctl->free_space -= info->bytes; +} + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + int ret = 0; + + lockdep_assert_held(&ctl->tree_lock); + + ASSERT(info->bytes || info->bitmap); + ret = tree_insert_offset(ctl, NULL, info); + if (ret) + return ret; + + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + 
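+ /* Untrimmed extent entries are candidates for async discard, count them. */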
ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + + ctl->free_space += info->bytes; + ctl->free_extents++; + return ret; +} + +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + ASSERT(info->bitmap); + + /* + * If our entry is empty it's because we're on a cluster and we don't + * want to re-link it into our ctl bytes index. + */ + if (RB_EMPTY_NODE(&info->bytes_index)) + return; + + lockdep_assert_held(&ctl->tree_lock); + + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); +} + +static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes, bool update_stat) +{ + unsigned long start, count, end; + int extent_delta = -1; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); + + bitmap_clear(info->bitmap, start, count); + + info->bytes -= bytes; + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; + + relink_bitmap_entry(ctl, info); + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta++; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta++; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + } + + if (update_stat) + ctl->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, count, end; + int extent_delta = 1; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); + + bitmap_set(info->bitmap, start, count); + + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + info->bytes += bytes; + ctl->free_space += bytes; + + relink_bitmap_entry(ctl, info); + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta--; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta--; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; + } +} + +/* + * If we can not find suitable extent, we will use bytes to record + * the size of the max extent. + */ +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes, bool for_alloc) +{ + unsigned long found_bits = 0; + unsigned long max_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + unsigned long extent_bits; + + /* + * Skip searching the bitmap if we don't have a contiguous section that + * is large enough for this allocation. 
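+ * A max_extent_size of zero means this bitmap has not been fully searched
+ * yet, so the shortcut cannot be taken.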
+ */ + if (for_alloc && + bitmap_info->max_extent_size && + bitmap_info->max_extent_size < *bytes) { + *bytes = bitmap_info->max_extent_size; + return -1; + } + + i = offset_to_bit(bitmap_info->offset, ctl->unit, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, ctl->unit); + + for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + if (for_alloc && bits == 1) { + found_bits = 1; + break; + } + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + extent_bits = next_zero - i; + if (extent_bits >= bits) { + found_bits = extent_bits; + break; + } else if (extent_bits > max_bits) { + max_bits = extent_bits; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * ctl->unit) + bitmap_info->offset; + *bytes = (u64)(found_bits) * ctl->unit; + return 0; + } + + *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); + return -1; +} + +/* Cache the size of the max extent in bytes */ +static struct btrfs_free_space * +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + unsigned long align, u64 *max_extent_size, bool use_bytes_index) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + u64 tmp; + u64 align_off; + int ret; + + if (!ctl->free_space_offset.rb_node) + goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } + + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. + */ + if (entry->bytes < *bytes) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); + if (use_bytes_index) + break; + continue; + } + + /* make sure the space returned is big enough + * to match our requested alignment + */ + if (*bytes >= align) { + tmp = entry->offset - ctl->start + align - 1; + tmp = div64_u64(tmp, align); + tmp = tmp * align + ctl->start; + align_off = tmp - entry->offset; + } else { + align_off = 0; + tmp = entry->offset; + } + + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ + if (entry->bytes < *bytes + align_off) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); + continue; + } + + if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); + u64 size = *bytes; + + ret = search_bitmap(ctl, entry, &tmp, &size, true); + if (!ret) { + *offset = tmp; + *bytes = size; + return entry; + } else { + *max_extent_size = + max(get_max_extent_size(entry), + *max_extent_size); + } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. 
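+ * (A change in rb_next(node) is used as a cheap signal that the entry was
+ * re-linked in the bytes index by relink_bitmap_entry().)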
+ */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; + continue; + } + + *offset = tmp; + *bytes = entry->bytes - align_off; + return entry; + } +out: + return NULL; +} + +static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset) +{ + info->offset = offset_to_bitmap(ctl, offset); + info->bytes = 0; + info->bitmap_extents = 0; + INIT_LIST_HEAD(&info->list); + link_free_space(ctl, info); + ctl->total_bitmaps++; + recalculate_thresholds(ctl); +} + +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + /* + * Normally when this is called, the bitmap is completely empty. However, + * if we are blowing up the free space cache for one reason or another + * via __btrfs_remove_free_space_cache(), then it may not be freed and + * we may leave stats on the table. + */ + if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] -= + bitmap_info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; + + } + unlink_free_space(ctl, bitmap_info, true); + kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + recalculate_thresholds(ctl); +} + +static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; + + /* + * We need to search for bits in this bitmap. We could only cover some + * of the extent in this bitmap thanks to how we add space, so we need + * to search for as much as it as we can and clear that amount, and then + * go searching for the next bit. + */ + search_start = *offset; + search_bytes = ctl->unit; + search_bytes = min(search_bytes, end - search_start + 1); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, + false); + if (ret < 0 || search_start != *offset) + return -EINVAL; + + /* We may have found more bits than what we need */ + search_bytes = min(search_bytes, *bytes); + + /* Cannot clear past the end of the bitmap */ + search_bytes = min(search_bytes, end - search_start + 1); + + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); + *offset += search_bytes; + *bytes -= search_bytes; + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. 
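+ * (-EAGAIN makes the caller restart the whole lookup from the offset tree.)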
+ */ + search_start = *offset; + search_bytes = ctl->unit; + ret = search_bitmap(ctl, bitmap_info, &search_start, + &search_bytes, false); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + return 0; +} + +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes, enum btrfs_trim_state trim_state) +{ + u64 bytes_to_set = 0; + u64 end; + + /* + * This is a tradeoff to make bitmap trim state minimal. We mark the + * whole bitmap untrimmed if at any point we add untrimmed regions. + */ + if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { + if (btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_block_group *block_group = ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; + bool forced = false; + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group)) + forced = true; +#endif + + /* This is a way to reclaim large regions from the bitmaps. */ + if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) + return false; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (!forced && ctl->free_extents < ctl->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them, we want + * to reserve them to larger extents, however if we have plenty + * of cache left then go ahead an dadd them, no sense in adding + * the overhead of a bitmap if we don't have to. + */ + if (info->bytes <= fs_info->sectorsize * 8) { + if (ctl->free_extents * 3 <= ctl->extents_thresh) + return false; + } else { + return false; + } + } + + /* + * The original block groups from mkfs can be really small, like 8 + * megabytes, so don't bother with a bitmap for those entries. However + * some block groups can be smaller than what a bitmap would cover but + * are still large enough that they could overflow the 32k memory limit, + * so allow those block groups to still be allowed to have a bitmap + * entry. + */ + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) + return false; + + return true; +} + +static const struct btrfs_free_space_op free_space_op = { + .use_bitmap = use_bitmap, +}; + +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + struct btrfs_block_group *block_group = NULL; + int added = 0; + u64 bytes, offset, bytes_added; + enum btrfs_trim_state trim_state; + int ret; + + bytes = info->bytes; + offset = info->offset; + trim_state = info->trim_state; + + if (!ctl->op->use_bitmap(ctl, info)) + return 0; + + if (ctl->op == &free_space_op) + block_group = ctl->block_group; +again: + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. 
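+ * If the cluster's bitmap only absorbs part of the range, the remainder
+ * falls through to the regular bitmap lookup below.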
+ */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, offset, + bytes, trim_state); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } + +no_cluster_bitmap: + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + ASSERT(added == 0); + goto new_bitmap; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); + bytes -= bytes_added; + offset += bytes_added; + added = 0; + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(ctl, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&ctl->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!info) { + spin_lock(&ctl->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, + GFP_NOFS); + info->trim_state = BTRFS_TRIM_STATE_TRIMMED; + spin_lock(&ctl->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kmem_cache_free(btrfs_free_space_bitmap_cachep, + info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, info); + } + + return ret; +} + +/* + * Free space merging rules: + * 1) Merge trimmed areas together + * 2) Let untrimmed areas coalesce with trimmed areas + * 3) Always pull neighboring regions from bitmaps + * + * The above rules are for when we merge free space based on btrfs_trim_state. + * Rules 2 and 3 are subtle because they are suboptimal, but are done for the + * same reason: to promote larger extent regions which makes life easier for + * find_free_extent(). Rule 2 enables coalescing based on the common path + * being returning free space from btrfs_finish_extent_commit(). So when free + * space is trimmed, it will prevent aggregating trimmed new region and + * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents + * and provide find_free_extent() with the largest extents possible hoping for + * the reuse path. 
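+ *
+ * For example (illustration only): adding an untrimmed [4K, 8K) next to an
+ * already trimmed extent [0, 4K) merges them into one untrimmed [0, 8K)
+ * entry (rule 2), while adding a trimmed region next to an untrimmed
+ * extent leaves two separate entries.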
+ */ +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, bool update_stat) +{ + struct btrfs_free_space *left_info = NULL; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; + const bool is_trimmed = btrfs_free_space_trimmed(info); + struct rb_node *right_prev = NULL; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(ctl, offset + bytes, 0, 0); + if (right_info) + right_prev = rb_prev(&right_info->offset_index); + + if (right_prev) + left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); + else if (!right_info) + left_info = tree_search_offset(ctl, offset - 1, 0, 0); + + /* See try_merge_free_space() comment. */ + if (right_info && !right_info->bitmap && + (!is_trimmed || btrfs_free_space_trimmed(right_info))) { + unlink_free_space(ctl, right_info, update_stat); + info->bytes += right_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, right_info); + merged = true; + } + + /* See try_merge_free_space() comment. */ + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset && + (!is_trimmed || btrfs_free_space_trimmed(left_info))) { + unlink_free_space(ctl, left_info, update_stat); + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, left_info); + merged = true; + } + + return merged; +} + +static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + unsigned long i; + unsigned long j; + const u64 end = info->offset + info->bytes; + const u64 bitmap_offset = offset_to_bitmap(ctl, end); + u64 bytes; + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, end); + j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); + if (j == i) + return false; + bytes = (j - i) * ctl->unit; + info->bytes += bytes; + + /* See try_merge_free_space() comment. */ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + u64 bitmap_offset; + unsigned long i; + unsigned long j; + unsigned long prev_j; + u64 bytes; + + bitmap_offset = offset_to_bitmap(ctl, info->offset); + /* If we're on a boundary, try the previous logical bitmap. */ + if (bitmap_offset == info->offset) { + if (info->offset == 0) + return false; + bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); + } + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; + j = 0; + prev_j = (unsigned long)-1; + for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { + if (j > i) + break; + prev_j = j; + } + if (prev_j == i) + return false; + + if (prev_j == (unsigned long)-1) + bytes = (i + 1) * ctl->unit; + else + bytes = (i - prev_j) * ctl->unit; + + info->offset -= bytes; + info->bytes += bytes; + + /* See try_merge_free_space() comment. 
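+ * (Growing into an untrimmed bitmap makes the resulting extent untrimmed.)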
*/ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +/* + * We prefer always to allocate from extent entries, both for clustered and + * non-clustered allocation requests. So when attempting to add a new extent + * entry, try to see if there's adjacent free space in bitmap entries, and if + * there is, migrate that space from the bitmaps to the extent. + * Like this we get better chances of satisfying space allocation requests + * because we attempt to satisfy them based on a single cache entry, and never + * on 2 or more entries - even if the entries represent a contiguous free space + * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry + * ends). + */ +static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + /* + * Only work with disconnected entries, as we can change their offset, + * and must be extent entries. + */ + ASSERT(!info->bitmap); + ASSERT(RB_EMPTY_NODE(&info->offset_index)); + + if (ctl->total_bitmaps > 0) { + bool stole_end; + bool stole_front = false; + + stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); + if (ctl->total_bitmaps > 0) + stole_front = steal_from_bitmap_to_front(ctl, info, + update_stat); + + if (stole_end || stole_front) + try_merge_free_space(ctl, info, update_stat); + } +} + +int __btrfs_add_free_space(struct btrfs_block_group *block_group, + u64 offset, u64 bytes, + enum btrfs_trim_state trim_state) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + int ret = 0; + u64 filter_bytes = bytes; + + ASSERT(!btrfs_is_zoned(fs_info)); + + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + info->trim_state = trim_state; + RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); + + spin_lock(&ctl->tree_lock); + + if (try_merge_free_space(ctl, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(ctl, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: + /* + * Only steal free space from adjacent bitmaps if we're sure we're not + * going to add the new free space to existing bitmap entries - because + * that would mean unnecessary work that would be reverted. Therefore + * attempt to steal space from bitmaps if we're adding an extent entry. 
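+ * We only get here when the range will be linked as an extent entry,
+ * either because it merged with an existing extent above or because
+ * insert_into_bitmap() declined it.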
+ */ + steal_from_bitmap(ctl, info, true); + + filter_bytes = max(filter_bytes, info->bytes); + + ret = link_free_space(ctl, info); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); +out: + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); + + if (ret) { + btrfs_crit(fs_info, "unable to add free space :%d", ret); + ASSERT(ret != -EEXIST); + } + + if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { + btrfs_discard_check_filter(block_group, filter_bytes); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + } + + return ret; +} + +static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) +{ + struct btrfs_space_info *sinfo = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + int bg_reclaim_threshold = 0; + bool initial = (size == block_group->length); + u64 reclaimable_unusable; + + WARN_ON(!initial && offset + size > block_group->zone_capacity); + + if (!initial) + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + + spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; + else if (initial) + to_free = block_group->zone_capacity; + else if (offset >= block_group->alloc_offset) + to_free = size; + else if (offset + size <= block_group->alloc_offset) + to_free = 0; + else + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + + ctl->free_space += to_free; + /* + * If the block group is read-only, we should account freed space into + * bytes_readonly. + */ + if (!block_group->ro) + block_group->zone_unusable += to_unusable; + spin_unlock(&ctl->tree_lock); + if (!used) { + spin_lock(&block_group->lock); + block_group->alloc_offset -= size; + spin_unlock(&block_group->lock); + } + + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); + /* All the region is now unusable. Mark it as unused and reclaim */ + if (block_group->zone_unusable == block_group->length) { + btrfs_mark_bg_unused(block_group); + } else if (bg_reclaim_threshold && + reclaimable_unusable >= + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { + btrfs_mark_bg_to_reclaim(block_group); + } + + return 0; +} + +int btrfs_add_free_space(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); +} + +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + false); + + return btrfs_add_free_space(block_group, bytenr, size); +} + +/* + * This is a subtle distinction because when adding free space back in general, + * we want it to be added as untrimmed for async. But in the case where we add + * it on loading of a block group, we want to consider it trimmed. 
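+ * Hence, unlike btrfs_add_free_space(), DISCARD_ASYNC alone is enough to
+ * mark the range as trimmed here.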
+ */ +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || + btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); +} + +int btrfs_remove_free_space(struct btrfs_block_group *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + int ret; + bool re_search = false; + + if (btrfs_is_zoned(block_group->fs_info)) { + /* + * This can happen with conventional zones when replaying log. + * Since the allocation info of tree-log nodes are not recorded + * to the extent-tree, calculate_alloc_pointer() failed to + * advance the allocation pointer after last allocated tree log + * node blocks. + * + * This function is called from + * btrfs_pin_extent_for_log_replay() when replaying the log. + * Advance the pointer not to overwrite the tree-log nodes. + */ + if (block_group->start + block_group->alloc_offset < + offset + bytes) { + block_group->alloc_offset = + offset + bytes - block_group->start; + } + return 0; + } + + spin_lock(&ctl->tree_lock); + +again: + ret = 0; + if (!bytes) + goto out_lock; + + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) { + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. + */ + WARN_ON(re_search); + goto out_lock; + } + } + + re_search = false; + if (!info->bitmap) { + unlink_free_space(ctl, info, true); + if (offset == info->offset) { + u64 to_free = min(bytes, info->bytes); + + info->bytes -= to_free; + info->offset += to_free; + if (info->bytes) { + ret = link_free_space(ctl, info); + WARN_ON(ret); + } else { + kmem_cache_free(btrfs_free_space_cachep, info); + } + + offset += to_free; + bytes -= to_free; + goto again; + } else { + u64 old_end = info->bytes + info->offset; + + info->bytes = offset - info->offset; + ret = link_free_space(ctl, info); + WARN_ON(ret); + if (ret) + goto out_lock; + + /* Not enough bytes in this entry to satisfy us */ + if (old_end < offset + bytes) { + bytes -= old_end - offset; + offset = old_end; + goto again; + } else if (old_end == offset + bytes) { + /* all done */ + goto out_lock; + } + spin_unlock(&ctl->tree_lock); + + ret = __btrfs_add_free_space(block_group, + offset + bytes, + old_end - (offset + bytes), + info->trim_state); + WARN_ON(ret); + goto out; + } + } + + ret = remove_from_bitmap(ctl, info, &offset, &bytes); + if (ret == -EAGAIN) { + re_search = true; + goto again; + } +out_lock: + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); +out: + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group *block_group, + u64 bytes) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + /* + * Zoned btrfs does not use free space tree and cluster. 
Just print + * out the free space after the allocation offset. + */ + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, "free space %llu active %d", + block_group->zone_capacity - block_group->alloc_offset, + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)); + return; + } + + spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes && !block_group->ro) + count++; + btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, + (info->bitmap) ? "yes" : "no"); + } + spin_unlock(&ctl->tree_lock); + btrfs_info(fs_info, "block group has cluster?: %s", + list_empty(&block_group->cluster_list) ? "no" : "yes"); + btrfs_info(fs_info, + "%d free space entries at or bigger than %llu bytes", + count, bytes); +} + +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = fs_info->sectorsize; + ctl->start = block_group->start; + ctl->block_group = block_group; + ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); +} + +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. 
In that case, we just return without changing anything + */ +static void __btrfs_return_cluster_to_free_space( + struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct rb_node *node; + + lockdep_assert_held(&ctl->tree_lock); + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) { + spin_unlock(&cluster->lock); + return; + } + + cluster->block_group = NULL; + cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + + node = rb_first(&cluster->root); + while (node) { + struct btrfs_free_space *entry; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + RB_CLEAR_NODE(&entry->offset_index); + + if (!entry->bitmap) { + /* Merging treats extents as if they were new */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= + entry->bytes; + } + + try_merge_free_space(ctl, entry, false); + steal_from_bitmap(ctl, entry, false); + + /* As we insert directly, update these statistics */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += + entry->bytes; + } + } + tree_insert_offset(ctl, NULL, entry); + rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, + entry_less); + } + cluster->root = RB_ROOT; + spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_cluster *cluster; + struct list_head *head; + + spin_lock(&ctl->tree_lock); + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + + cond_resched_lock(&ctl->tree_lock); + } + __btrfs_remove_free_space_cache(ctl); + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); + +} + +/* + * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
+ */ +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *node; + bool ret = true; + + spin_lock(&ctl->tree_lock); + node = rb_first(&ctl->free_space_offset); + + while (node) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + + if (!btrfs_free_space_trimmed(info)) { + ret = false; + break; + } + + node = rb_next(node); + } + + spin_unlock(&ctl->tree_lock); + return ret; +} + +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; + struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; + u64 ret = 0; + u64 align_gap = 0; + u64 align_gap_len = 0; + enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); + + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search, + block_group->full_stripe_len, max_extent_size, + use_bytes_index); + if (!entry) + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(ctl, entry, offset, bytes, true); + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + + if (!entry->bytes) + free_bitmap(ctl, entry); + } else { + unlink_free_space(ctl, entry, true); + align_gap_len = offset - entry->offset; + align_gap = entry->offset; + align_gap_trim_state = entry->trim_state; + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + + entry->offset = offset + bytes; + WARN_ON(entry->bytes < bytes + align_gap_len); + + entry->bytes -= bytes + align_gap_len; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } +out: + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); + + if (align_gap_len) + __btrfs_add_free_space(block_group, align_gap, align_gap_len, + align_gap_trim_state); + return ret; +} + +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. 
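+ * Either way the cluster's extents go back into the free space cache and the + * block group is handed to the discard workqueue afterwards.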
+ */ +void btrfs_return_cluster_to_free_space( + struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl; + + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return; + } + btrfs_get_block_group(block_group); + spin_unlock(&cluster->lock); + + ctl = block_group->free_space_ctl; + + /* now return any extents the cluster had on it */ + spin_lock(&ctl->tree_lock); + __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&ctl->tree_lock); + + btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); + + /* finally drop our ref */ + btrfs_put_block_group(block_group); +} + +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *entry, + u64 bytes, u64 min_start, + u64 *max_extent_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); + if (err) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); + return 0; + } + + ret = search_start; + bitmap_clear_bits(ctl, entry, ret, bytes, false); + + return ret; +} + +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start, u64 *max_extent_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + while (1) { + if (entry->bytes < bytes) + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); + + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + cluster->window_start, + max_extent_size); + if (ret == 0) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + cluster->window_start += bytes; + } else { + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + } + + break; + } +out: + spin_unlock(&cluster->lock); + + if (!ret) + return 0; + + spin_lock(&ctl->tree_lock); + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + + ctl->free_space -= bytes; + 
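+ /* If the allocation emptied the entry, it is dropped from the cluster below. */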
if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + + spin_lock(&cluster->lock); + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + ctl->free_extents--; + if (entry->bitmap) { + kmem_cache_free(btrfs_free_space_bitmap_cachep, + entry->bitmap); + ctl->total_bitmaps--; + recalculate_thresholds(ctl); + } else if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + } + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&cluster->lock); + spin_unlock(&ctl->tree_lock); + + return ret; +} + +static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + unsigned long next_zero; + unsigned long i; + unsigned long want_bits; + unsigned long min_bits; + unsigned long found_bits; + unsigned long max_bits = 0; + unsigned long start = 0; + unsigned long total_found = 0; + int ret; + + lockdep_assert_held(&ctl->tree_lock); + + i = offset_to_bit(entry->offset, ctl->unit, + max_t(u64, offset, entry->offset)); + want_bits = bytes_to_bits(bytes, ctl->unit); + min_bits = bytes_to_bits(min_bytes, ctl->unit); + + /* + * Don't bother looking for a cluster in this bitmap if it's heavily + * fragmented. + */ + if (entry->max_extent_size && + entry->max_extent_size < cont1_bytes) + return -ENOSPC; +again: + found_bits = 0; + for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= min_bits) { + found_bits = next_zero - i; + if (found_bits > max_bits) + max_bits = found_bits; + break; + } + if (next_zero - i > max_bits) + max_bits = next_zero - i; + i = next_zero; + } + + if (!found_bits) { + entry->max_extent_size = (u64)max_bits * ctl->unit; + return -ENOSPC; + } + + if (!total_found) { + start = i; + cluster->max_size = 0; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * ctl->unit) + cluster->max_size = found_bits * ctl->unit; + + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; + goto again; + } + + cluster->window_start = start * ctl->unit + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index when we + * manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. + */ + RB_CLEAR_NODE(&entry->bytes_index); + + ret = tree_insert_offset(ctl, cluster, entry); + ASSERT(!ret); /* -EEXIST; Logic error */ + + trace_btrfs_setup_cluster(block_group, cluster, + total_found * ctl->unit, 1); + return 0; +} + +/* + * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. 
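+ * (every extent pulled into the cluster must itself be at least min_bytes; + * smaller extents are skipped and bitmap entries are only collected for + * setup_cluster_bitmap() to try later).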
+ */ +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *first = NULL; + struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *last; + struct rb_node *node; + u64 window_free; + u64 max_extent; + u64 total_size = 0; + + lockdep_assert_held(&ctl->tree_lock); + + entry = tree_search_offset(ctl, offset, 0, 1); + if (!entry) + return -ENOSPC; + + /* + * We don't want bitmaps, so just move along until we find a normal + * extent entry. + */ + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + window_free = entry->bytes; + max_extent = entry->bytes; + first = entry; + last = entry; + + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + continue; + } + + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + + cluster->window_start = first->offset; + + node = &first->offset_index; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + */ + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (entry->bitmap || entry->bytes < min_bytes) + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + ret = tree_insert_offset(ctl, cluster, entry); + total_size += entry->bytes; + ASSERT(!ret); /* -EEXIST; Logic error */ + } while (node && entry != last); + + cluster->max_size = max_extent; + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. + */ +static noinline int +setup_cluster_bitmap(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); + + if (ctl->total_bitmaps == 0) + return -ENOSPC; + + /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. 
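+ * If it is not already on the list we look it up and add it below.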
+ */ + if (!list_empty(bitmaps)) + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + + if (!entry || entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, cont1_bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. + */ + return -ENOSPC; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes+empty_size. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); + u64 min_bytes; + u64 cont1_bytes; + int ret; + + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ + if (btrfs_test_opt(fs_info, SSD_SPREAD)) { + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + cont1_bytes = bytes; + min_bytes = fs_info->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = fs_info->sectorsize; + } + + spin_lock(&ctl->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. 
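+ * ctl->free_space is the total free space in the group, not necessarily + * contiguous, so this is only a cheap early out.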
+ */ + if (ctl->free_space < bytes) { + spin_unlock(&ctl->tree_lock); + return -ENOSPC; + } + + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } + + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes + empty_size, + cont1_bytes, min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes + empty_size, + cont1_bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); + + if (!ret) { + btrfs_get_block_group(block_group); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); + } +out: + spin_unlock(&cluster->lock); + spin_unlock(&ctl->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root = RB_ROOT; + cluster->max_size = 0; + cluster->fragmented = false; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + +static int do_trimming(struct btrfs_block_group *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes, + enum btrfs_trim_state reserved_trim_state, + struct btrfs_trim_range *trim_entry) +{ + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int ret; + int update = 0; + const u64 end = start + bytes; + const u64 reserved_end = reserved_start + reserved_bytes; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + u64 trimmed = 0; + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); + if (!ret) { + *total_trimmed += trimmed; + trim_state = BTRFS_TRIM_STATE_TRIMMED; + } + + mutex_lock(&ctl->cache_writeout_mutex); + if (reserved_start < start) + __btrfs_add_free_space(block_group, reserved_start, + start - reserved_start, + reserved_trim_state); + if (end < reserved_end) + __btrfs_add_free_space(block_group, end, reserved_end - end, + reserved_trim_state); + __btrfs_add_free_space(block_group, start, bytes, trim_state); + list_del(&trim_entry->list); + mutex_unlock(&ctl->cache_writeout_mutex); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + } + + return ret; +} + +/* + * If @async is set, then we will trim 1 region and return. 
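+ * Otherwise we keep trimming extent entries until we reach @end, which is + * how btrfs_trim_block_group() uses this with @async == false.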
+ */ +static int trim_no_bitmap(struct btrfs_block_group *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + bool async) +{ + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + enum btrfs_trim_state extent_trim_state; + u64 bytes; + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); + + while (start < end) { + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) + goto out_unlock; + + entry = tree_search_offset(ctl, start, 0, 1); + if (!entry) + goto out_unlock; + + /* Skip bitmaps and if async, already trimmed entries */ + while (entry->bitmap || + (async && btrfs_free_space_trimmed(entry))) { + node = rb_next(&entry->offset_index); + if (!node) + goto out_unlock; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) + goto out_unlock; + + extent_start = entry->offset; + extent_bytes = entry->bytes; + extent_trim_state = entry->trim_state; + if (async) { + start = entry->offset; + bytes = entry->bytes; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + unlink_free_space(ctl, entry, true); + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim + * X when we come back around. So trim it now. + */ + if (max_discard_size && + bytes >= (max_discard_size + + BTRFS_ASYNC_DISCARD_MIN_FILTER)) { + bytes = max_discard_size; + extent_bytes = max_discard_size; + entry->offset += max_discard_size; + entry->bytes -= max_discard_size; + link_free_space(ctl, entry); + } else { + kmem_cache_free(btrfs_free_space_cachep, entry); + } + } else { + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + + unlink_free_space(ctl, entry, true); + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + trim_entry.start = extent_start; + trim_entry.bytes = extent_bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes, extent_trim_state, + &trim_entry); + if (ret) { + block_group->discard_cursor = start + bytes; + break; + } +next: + start += bytes; + block_group->discard_cursor = start; + if (async && *total_trimmed) + break; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; + +out_unlock: + block_group->discard_cursor = btrfs_block_group_end(block_group); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + + return ret; +} + +/* + * If we break out of trimming a bitmap prematurely, we should reset the + * trimming bit. In a rather contrieved case, it's possible to race here so + * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 
+ * + * start = start of bitmap + * end = near end of bitmap + * + * Thread 1: Thread 2: + * trim_bitmaps(start) + * trim_bitmaps(end) + * end_trimming_bitmap() + * reset_trimming_bitmap() + */ +static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) +{ + struct btrfs_free_space *entry; + + spin_lock(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 1, 0); + if (entry) { + if (btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; + } + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + + spin_unlock(&ctl->tree_lock); +} + +static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *entry) +{ + if (btrfs_free_space_trimming_bitmap(entry)) { + entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; + ctl->discardable_extents[BTRFS_STAT_CURR] -= + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; + } +} + +/* + * If @async is set, then we will trim 1 region and return. + */ +static int trim_bitmaps(struct btrfs_block_group *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) +{ + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); + + while (offset < end) { + bool next_bitmap = false; + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + block_group->discard_cursor = + btrfs_block_group_end(block_group); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + /* + * Bitmaps are marked trimmed lossily now to prevent constant + * discarding of the same bitmap (the reason why we are bound + * by the filters). So, retrim the block group bitmaps when we + * are preparing to punt to the unused_bgs list. This uses + * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED + * which is the only discard index which sets minlen to 0. + */ + if (!entry || (async && minlen && start == offset && + btrfs_free_space_trimmed(entry))) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + next_bitmap = true; + goto next; + } + + /* + * Async discard bitmap trimming begins at by setting the start + * to be key.objectid and the offset_to_bitmap() aligns to the + * start of the bitmap. This lets us know we are fully + * scanning the bitmap rather than only some portion of it. + */ + if (start == offset) + entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes, false); + if (ret2 || start >= end) { + /* + * We lossily consider a bitmap trimmed if we only skip + * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. + */ + if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) + end_trimming_bitmap(ctl, entry); + else + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + next_bitmap = true; + goto next; + } + + /* + * We already trimmed a region, but are using the locking above + * to reset the trim_state. 
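+ * Bail out here so a single async call never trims more than one region.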
+ */ + if (async && *total_trimmed) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto out; + } + + bytes = min(bytes, end - start); + if (bytes < minlen || (async && maxlen && bytes > maxlen)) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < @minlen, we won't trim X when we come back around. + * So trim it now. We differ here from trimming extents as we + * don't keep individual state per bit. + */ + if (async && + max_discard_size && + bytes > (max_discard_size + minlen)) + bytes = max_discard_size; + + bitmap_clear_bits(ctl, entry, start, bytes, true); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + trim_entry.start = start; + trim_entry.bytes = bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes, 0, &trim_entry); + if (ret) { + reset_trimming_bitmap(ctl, offset); + block_group->discard_cursor = + btrfs_block_group_end(block_group); + break; + } +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + start = offset; + } else { + start += bytes; + } + block_group->discard_cursor = start; + + if (fatal_signal_pending(current)) { + if (start != offset) + reset_trimming_bitmap(ctl, offset); + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + if (offset >= end) + block_group->discard_cursor = end; + +out: + return ret; +} + +int btrfs_trim_block_group(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int ret; + u64 rem = 0; + + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_freeze_block_group(block_group); + spin_unlock(&block_group->lock); + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); + if (ret) + goto out; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); + div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); + /* If we ended in the middle of a bitmap, reset the trimming flag */ + if (rem) + reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); +out: + btrfs_unfreeze_block_group(block_group); + return ret; +} + +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_freeze_block_group(block_group); + spin_unlock(&block_group->lock); + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); + btrfs_unfreeze_block_group(block_group); + + return ret; +} + +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_freeze_block_group(block_group); + spin_unlock(&block_group->lock); + + ret = trim_bitmaps(block_group, trimmed, start, end, 
minlen, maxlen, + async); + + btrfs_unfreeze_block_group(block_group); + + return ret; +} + +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) +{ + return btrfs_super_cache_generation(fs_info->super_copy); +} + +static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + struct btrfs_block_group *block_group; + struct rb_node *node; + int ret = 0; + + btrfs_info(fs_info, "cleaning free space cache v1"); + + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group, cache_node); + ret = btrfs_remove_free_space_inode(trans, NULL, block_group); + if (ret) + goto out; + node = rb_next(node); + } +out: + return ret; +} + +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * update_super_roots will appropriately set or unset + * super_copy->cache_generation based on SPACE_CACHE and + * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a + * transaction commit whether we are enabling space cache v1 and don't + * have any other work to do, or are disabling it and removing free + * space inodes. + */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (!active) { + set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + ret = cleanup_free_space_cache_v1(fs_info, trans); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } + + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + + return ret; +} + +int __init btrfs_free_space_init(void) +{ + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + return -ENOMEM; + + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; + } + + return 0; +} + +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. 
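+ * These helpers are only built when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is set.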
+ */ +int test_add_free_space_entry(struct btrfs_block_group *cache, + u64 offset, u64 bytes, bool bitmap) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; + u64 bytes_added; + int ret; + +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + } + + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + info->max_extent_size = 0; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } + + if (!map) { + map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } + + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + info = NULL; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); + + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); + + if (bytes) + goto again; + + if (info) + kmem_cache_free(btrfs_free_space_cachep, info); + if (map) + kmem_cache_free(btrfs_free_space_bitmap_cachep, map); + return 0; +} + +/* + * Checks to see if the given range is in the free space cache. This is really + * just used to check the absence of space, so if there is free space in the + * range at all we will return 1. + */ +int test_check_exists(struct btrfs_block_group *cache, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info; + int ret = 0; + + spin_lock(&ctl->tree_lock); + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) + goto out; + } + +have_info: + if (info->bitmap) { + u64 bit_off, bit_bytes; + struct rb_node *n; + struct btrfs_free_space *tmp; + + bit_off = offset; + bit_bytes = ctl->unit; + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); + if (!ret) { + if (bit_off == offset) { + ret = 1; + goto out; + } else if (bit_off > offset && + offset + bytes > bit_off) { + ret = 1; + goto out; + } + } + + n = rb_prev(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (tmp->offset + tmp->bytes < offset) + break; + if (offset + bytes < tmp->offset) { + n = rb_prev(&tmp->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + n = rb_next(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (offset + bytes < tmp->offset) + break; + if (tmp->offset + tmp->bytes < offset) { + n = rb_next(&tmp->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + ret = 0; + goto out; + } + + if (info->offset == offset) { + ret = 1; + goto out; + } + + if (offset > info->offset && offset < info->offset + info->bytes) + ret = 1; +out: + spin_unlock(&ctl->tree_lock); + return ret; +} +#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 0000000000..33b4da3271 --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
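+/* + * Rough sketch of how the in-memory side of this API is typically used; the + * variable names here are illustrative only, see the callers elsewhere in + * fs/btrfs for the real sequences: + * + * btrfs_init_free_space_ctl(block_group, ctl); + * btrfs_add_free_space(block_group, bytenr, size); + * offset = btrfs_find_space_for_alloc(block_group, start, bytes, + * empty_size, &max_extent_size); + * btrfs_remove_free_space_cache(block_group); + */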
+/* + * Copyright (C) 2009 Oracle. All rights reserved. + */ + +#ifndef BTRFS_FREE_SPACE_CACHE_H +#define BTRFS_FREE_SPACE_CACHE_H + +/* + * This is the trim state of an extent or bitmap. + * + * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a + * bitmap as we may need several trims to fully trim a single bitmap entry. + * This is reset should any free space other than trimmed space be added to the + * bitmap. + */ +enum btrfs_trim_state { + BTRFS_TRIM_STATE_UNTRIMMED, + BTRFS_TRIM_STATE_TRIMMED, + BTRFS_TRIM_STATE_TRIMMING, +}; + +struct btrfs_free_space { + struct rb_node offset_index; + struct rb_node bytes_index; + u64 offset; + u64 bytes; + u64 max_extent_size; + unsigned long *bitmap; + struct list_head list; + enum btrfs_trim_state trim_state; + s32 bitmap_extents; +}; + +static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info) +{ + return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED); +} + +static inline bool btrfs_free_space_trimming_bitmap( + struct btrfs_free_space *info) +{ + return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); +} + +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. + */ +enum { + BTRFS_STAT_CURR, + BTRFS_STAT_PREV, + BTRFS_STAT_NR_ENTRIES, +}; + +struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; + u64 free_space; + int extents_thresh; + int free_extents; + int total_bitmaps; + int unit; + u64 start; + s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; + s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; + const struct btrfs_free_space_op *op; + struct btrfs_block_group *block_group; + struct mutex cache_writeout_mutex; + struct list_head trimming_ranges; +}; + +struct btrfs_free_space_op { + bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +}; + +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_fs_info *fs_info; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; +}; + +int __init btrfs_free_space_init(void); +void __cold btrfs_free_space_exit(void); +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, + struct btrfs_path *path); +int create_free_space_inode(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group); + +int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct inode *inode); +int load_free_space_cache(struct btrfs_block_group *block_group); +int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); + +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl); +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, + u64 size, enum btrfs_trim_state trim_state); +int btrfs_add_free_space(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, 
u64 size); +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size); +void btrfs_dump_free_space(struct btrfs_block_group *block_group, + u64 bytes); +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start, u64 *max_extent_size); +void btrfs_return_cluster_to_free_space( + struct btrfs_block_group *block_group, + struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen); +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async); +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async); + +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active); +/* Support functions for running our sanity tests */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int test_add_free_space_entry(struct btrfs_block_group *cache, + u64 offset, u64 bytes, bool bitmap); +int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes); +#endif + +#endif diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c new file mode 100644 index 0000000000..7b598b0707 --- /dev/null +++ b/fs/btrfs/free-space-tree.c @@ -0,0 +1,1667 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Facebook. All rights reserved. + */ + +#include +#include +#include "messages.h" +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "free-space-tree.h" +#include "transaction.h" +#include "block-group.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); + +static struct btrfs_root *btrfs_free_space_root( + struct btrfs_block_group *block_group) +{ + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) + key.offset = block_group->global_root_id; + return btrfs_global_root(block_group->fs_info, &key); +} + +void set_free_space_tree_thresholds(struct btrfs_block_group *cache) +{ + u32 bitmap_range; + size_t bitmap_size; + u64 num_bitmaps, total_bitmap_size; + + if (WARN_ON(cache->length == 0)) + btrfs_warn(cache->fs_info, "block group %llu length is zero", + cache->start); + + /* + * We convert to bitmaps when the disk space required for using extents + * exceeds that required for using bitmaps. 
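+ * In other words, bitmap_high_thresh is the worst-case size of all bitmaps + * for this block group measured in (empty) extent items; above that many + * extents, bitmaps become the cheaper representation.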
+ */ + bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range); + bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; + total_bitmap_size = num_bitmaps * bitmap_size; + cache->bitmap_high_thresh = div_u64(total_bitmap_size, + sizeof(struct btrfs_item)); + + /* + * We allow for a small buffer between the high threshold and low + * threshold to avoid thrashing back and forth between the two formats. + */ + if (cache->bitmap_high_thresh > 100) + cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100; + else + cache->bitmap_low_thresh = 0; +} + +static int add_new_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_free_space_info *info; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = block_group->start; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->length; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_info); + btrfs_set_free_space_extent_count(leaf, info, 0); + btrfs_set_free_space_flags(leaf, info, 0); + btrfs_mark_buffer_dirty(trans, leaf); + + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +EXPORT_FOR_TESTS +struct btrfs_free_space_info *search_free_space_info( + struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, int cow) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_key key; + int ret; + + key.objectid = block_group->start; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->length; + + ret = btrfs_search_slot(trans, root, &key, path, 0, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret != 0) { + btrfs_warn(fs_info, "missing free space info for %llu", + block_group->start); + ASSERT(0); + return ERR_PTR(-ENOENT); + } + + return btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_free_space_info); +} + +/* + * btrfs_search_slot() but we're looking for the greatest key less than the + * passed key. + */ +static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) +{ + int ret; + + ret = btrfs_search_slot(trans, root, key, p, ins_len, cow); + if (ret < 0) + return ret; + + if (ret == 0) { + ASSERT(0); + return -EIO; + } + + if (p->slots[0] == 0) { + ASSERT(0); + return -EIO; + } + p->slots[0]--; + + return 0; +} + +static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info, + u64 size) +{ + return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE); +} + +static unsigned long *alloc_bitmap(u32 bitmap_size) +{ + unsigned long *ret; + unsigned int nofs_flag; + u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long)); + + /* + * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse + * into the filesystem as the free space bitmap can be modified in the + * critical section of a transaction commit. + * + * TODO: push the memalloc_nofs_{save,restore}() to the caller where we + * know that recursion is unsafe. 
+ */ + nofs_flag = memalloc_nofs_save(); + ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + return ret; +} + +static void le_bitmap_set(unsigned long *map, unsigned int start, int len) +{ + u8 *p = ((u8 *)map) + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + +EXPORT_FOR_TESTS +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + char *bitmap_cursor; + u64 start, end; + u64 bitmap_range, i; + u32 bitmap_size, flags, expected_extent_count; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(fs_info, block_group->length); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->start; + end = block_group->start + block_group->length; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + u64 first, last; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + first = div_u64(found_key.objectid - start, + fs_info->sectorsize); + last = div_u64(found_key.objectid + found_key.offset - start, + fs_info->sectorsize); + le_bitmap_set(bitmap, first, last - first); + + extent_count++; + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, + "incorrect extent count for %llu; counted %u, expected %u", + block_group->start, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + bitmap_cursor = (char *)bitmap; + bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + i = start; + while (i < end) { + unsigned long ptr; + u64 extent_size; + u32 data_size; + + extent_size = min(end - i, bitmap_range); + data_size = free_space_bitmap_size(fs_info, extent_size); 
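+ /* + * One BTRFS_FREE_SPACE_BITMAP_KEY item covers up to bitmap_range bytes: + * objectid is the logical start, key.offset is the number of bytes + * covered, and the item payload is the bitmap itself. + */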
+ + key.objectid = i; + key.type = BTRFS_FREE_SPACE_BITMAP_KEY; + key.offset = extent_size; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + data_size); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + i += extent_size; + bitmap_cursor += data_size; + } + + ret = 0; +out: + kvfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; +} + +EXPORT_FOR_TESTS +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + u64 start, end; + u32 bitmap_size, flags, expected_extent_count; + unsigned long nrbits, start_bit, end_bit; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(fs_info, block_group->length); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->start; + end = block_group->start + block_group->length; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + unsigned long ptr; + char *bitmap_cursor; + u32 bitmap_pos, data_size; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + bitmap_pos = div_u64(found_key.objectid - start, + fs_info->sectorsize * + BITS_PER_BYTE); + bitmap_cursor = ((char *)bitmap) + bitmap_pos; + data_size = free_space_bitmap_size(fs_info, + found_key.offset); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + read_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; + start_bit = find_next_bit_le(bitmap, nrbits, 0); + + while (start_bit < nrbits) { + end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); + ASSERT(start_bit < end_bit); + + key.objectid = start + start_bit * block_group->fs_info->sectorsize; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; + + ret = 
btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + + start_bit = find_next_bit_le(bitmap, nrbits, end_bit); + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, + "incorrect extent count for %llu; counted %u, expected %u", + block_group->start, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + kvfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; +} + +static int update_free_space_extent_count(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, + int new_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + u32 extent_count; + int ret = 0; + + if (new_extents == 0) + return 0; + + info = search_free_space_info(trans, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + extent_count += new_extents; + btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + btrfs_release_path(path); + + if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count > block_group->bitmap_high_thresh) { + ret = convert_free_space_to_bitmaps(trans, block_group, path); + } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count < block_group->bitmap_low_thresh) { + ret = convert_free_space_to_extents(trans, block_group, path); + } + +out: + return ret; +} + +EXPORT_FOR_TESTS +int free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 found_start, found_end; + unsigned long ptr, i; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(offset >= found_start && offset < found_end); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + i = div_u64(offset - found_start, + block_group->fs_info->sectorsize); + return !!extent_buffer_test_bit(leaf, ptr, i); +} + +static void free_space_set_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + int bit) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 end = *start + *size; + u64 found_start, found_end; + unsigned long ptr, first, last; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(*start >= found_start && *start < found_end); + ASSERT(end > found_start); + + if (end > found_end) + end = found_end; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + first = (*start - found_start) >> fs_info->sectorsize_bits; + last = (end - found_start) >> fs_info->sectorsize_bits; + if (bit) + extent_buffer_bitmap_set(leaf, ptr, first, last - first); + else + extent_buffer_bitmap_clear(leaf, ptr, first, last - first); + btrfs_mark_buffer_dirty(trans, leaf); + + *size -= end - *start; + *start = end; +} + +/* + * We can't use btrfs_next_item() in modify_free_space_bitmap() because + * btrfs_next_leaf() doesn't get the path for writing. 
We can forgo the fancy + * tree walking in btrfs_next_leaf() anyways because we know exactly what we're + * looking for. + */ +static int free_space_next_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p) +{ + struct btrfs_key key; + + if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) { + p->slots[0]++; + return 0; + } + + btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]); + btrfs_release_path(p); + + key.objectid += key.offset; + key.type = (u8)-1; + key.offset = (u64)-1; + + return btrfs_search_prev_slot(trans, root, &key, p, 0, 1); +} + +/* + * If remove is 1, then we are removing free space, thus clearing bits in the + * bitmap. If remove is 0, then we are adding free space, thus setting bits in + * the bitmap. + */ +static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, + u64 start, u64 size, int remove) +{ + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_key key; + u64 end = start + size; + u64 cur_start, cur_size; + int prev_bit, next_bit; + int new_extents; + int ret; + + /* + * Read the bit for the block immediately before the extent of space if + * that block is within the block group. + */ + if (start > block_group->start) { + u64 prev_block = start - block_group->fs_info->sectorsize; + + key.objectid = prev_block; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = free_space_test_bit(block_group, path, prev_block); + + /* The previous block may have been in the previous bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (start >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + } else { + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = -1; + } + + /* + * Iterate over all of the bitmaps overlapped by the extent of space, + * clearing/setting bits as required. + */ + cur_start = start; + cur_size = size; + while (1) { + free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, + !remove); + if (cur_size == 0) + break; + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + /* + * Read the bit for the block immediately after the extent of space if + * that block is within the block group. + */ + if (end < block_group->start + block_group->length) { + /* The next block may be in the next bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (end >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + next_bit = free_space_test_bit(block_group, path, end); + } else { + next_bit = -1; + } + + if (remove) { + new_extents = -1; + if (prev_bit == 1) { + /* Leftover on the left. */ + new_extents++; + } + if (next_bit == 1) { + /* Leftover on the right. */ + new_extents++; + } + } else { + new_extents = 1; + if (prev_bit == 1) { + /* Merging with neighbor on the left. */ + new_extents--; + } + if (next_bit == 1) { + /* Merging with neighbor on the right. 
*/ + new_extents--; + } + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, block_group, path, + new_extents); + +out: + return ret; +} + +static int remove_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_key key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = -1; + int ret; + + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(start >= found_start && end <= found_end); + + /* + * Okay, now that we've found the free space extent which contains the + * free space that we are removing, there are four cases: + * + * 1. We're using the whole extent: delete the key we found and + * decrement the free space extent count. + * 2. We are using part of the extent starting at the beginning: delete + * the key we found and insert a new key representing the leftover at + * the end. There is no net change in the number of extents. + * 3. We are using part of the extent ending at the end: delete the key + * we found and insert a new key representing the leftover at the + * beginning. There is no net change in the number of extents. + * 4. We are using part of the extent in the middle: delete the key we + * found and insert two new keys representing the leftovers on each + * side. Where we used to have one extent, we now have two, so increment + * the extent count. We may need to convert the block group to bitmaps + * as a result. + */ + + /* Delete the existing key (cases 1-4). */ + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* Add a key for leftovers at the beginning (cases 3 and 4). */ + if (start > found_start) { + key.objectid = found_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = start - found_start; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + /* Add a key for leftovers at the end (cases 2 and 4). 
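The new_extents bookkeeping in modify_free_space_bitmap() above boils down to a small pure function of the two neighboring bits. A standalone sketch of just that arithmetic (not kernel code; prev_bit/next_bit are 1 for a free neighbor, 0 for a used one, -1 when there is no neighbor inside the block group):

/* Sketch of the extent-count delta used by modify_free_space_bitmap(). */
#include <stdio.h>

static int extent_count_delta(int remove, int prev_bit, int next_bit)
{
	int delta;

	if (remove) {
		delta = -1;		/* the containing free extent goes away */
		if (prev_bit == 1)
			delta++;	/* leftover free piece on the left */
		if (next_bit == 1)
			delta++;	/* leftover free piece on the right */
	} else {
		delta = 1;		/* the new free range itself */
		if (prev_bit == 1)
			delta--;	/* merges with free neighbor on the left */
		if (next_bit == 1)
			delta--;	/* merges with free neighbor on the right */
	}
	return delta;
}

int main(void)
{
	/* Adding free space between two already-free neighbors: they and the
	 * new range collapse into one extent, so the count drops by one. */
	printf("%d\n", extent_count_delta(0, 1, 1));	/* prints -1 */
	/* Removing free space from the middle of a free extent leaves two
	 * pieces, so the count grows by one. */
	printf("%d\n", extent_count_delta(1, 1, 1));	/* prints 1 */
	return 0;
}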
*/ + if (end < found_end) { + key.objectid = end; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = found_end - end; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, block_group, path, + new_extents); + +out: + return ret; +} + +EXPORT_FOR_TESTS +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, block_group, path, + start, size, 1); + } else { + return remove_free_space_extent(trans, block_group, path, + start, size); + } +} + +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) +{ + struct btrfs_block_group *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(trans->fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __remove_from_free_space_tree(trans, block_group, path, start, + size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; +} + +static int add_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_key key, new_key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = 1; + int ret; + + /* + * We are adding a new extent of free space, but we need to merge + * extents. There are four cases here: + * + * 1. The new extent does not have any immediate neighbors to merge + * with: add the new key and increment the free space extent count. We + * may need to convert the block group to bitmaps as a result. + * 2. The new extent has an immediate neighbor before it: remove the + * previous key and insert a new key combining both of them. There is no + * net change in the number of extents. + * 3. The new extent has an immediate neighbor after it: remove the next + * key and insert a new key combining both of them. There is no net + * change in the number of extents. + * 4. The new extent has immediate neighbors on both sides: remove both + * of the keys and insert a new key combining all of them. Where we used + * to have two extents, we now have one, so decrement the extent count. + */ + + new_key.objectid = start; + new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + new_key.offset = size; + + /* Search for a neighbor on the left. 
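The four cases documented in remove_free_space_extent() above reduce to: carve [start, end) out of the containing free extent and keep whatever is left over on either side, with the extent count changing by the number of leftover pieces minus one. A small standalone sketch of that interval arithmetic (simplified types; the example values are hypothetical):

/* Sketch of the four removal cases: the free extent [found_start, found_end)
 * loses [start, end) and leaves zero, one or two smaller free extents. */
#include <assert.h>
#include <stdio.h>

struct interval { unsigned long long start, len; };

/* Returns the number of leftover pieces (0-2); the net change to the free
 * space extent count is that number minus one. */
static int remove_from_extent(unsigned long long found_start,
			      unsigned long long found_end,
			      unsigned long long start, unsigned long long end,
			      struct interval pieces[2])
{
	int n = 0;

	assert(start >= found_start && end <= found_end && start < end);

	if (start > found_start) {	/* cases 3 and 4: piece at the front */
		pieces[n].start = found_start;
		pieces[n].len = start - found_start;
		n++;
	}
	if (end < found_end) {		/* cases 2 and 4: piece at the back */
		pieces[n].start = end;
		pieces[n].len = found_end - end;
		n++;
	}
	return n;
}

int main(void)
{
	struct interval pieces[2];
	/* Case 4: carve [30, 40) out of [10, 100): leaves [10, 30) and [40, 100). */
	int n = remove_from_extent(10, 100, 30, 40, pieces);

	printf("pieces: %d, extent count change: %+d\n", n, n - 1);
	for (int i = 0; i < n; i++)
		printf("  start %llu len %llu\n", pieces[i].start, pieces[i].len);
	return 0;
}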
*/ + if (start == block_group->start) + goto right; + key.objectid = start - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto right; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->start && + found_end > block_group->start); + ASSERT(found_start < start && found_end <= start); + + /* + * Delete the neighbor on the left and absorb it into the new key (cases + * 2 and 4). + */ + if (found_end == start) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.objectid = found_start; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +right: + /* Search for a neighbor on the right. */ + if (end == block_group->start + block_group->length) + goto insert; + key.objectid = end; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto insert; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->start && + found_end > block_group->start); + ASSERT((found_start < start && found_end <= start) || + (found_start >= end && found_end > end)); + + /* + * Delete the neighbor on the right and absorb it into the new key + * (cases 3 and 4). + */ + if (found_start == end) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +insert: + /* Insert the new key (cases 1-4). 
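add_free_space_extent() above is the mirror image: the new range absorbs a free neighbor whose end touches its start and/or one whose start touches its end, and the extent count goes up by one minus the number of merges. A standalone sketch of that merge (simplified types; the example intervals are made up):

/* Sketch of the neighbor merging in add_free_space_extent(). */
#include <stdio.h>

struct key { unsigned long long objectid, offset; };	/* start, length */

static int add_free_extent(unsigned long long start, unsigned long long size,
			   const struct key *left, const struct key *right,
			   struct key *out)
{
	int delta = 1;		/* case 1: a brand new extent */

	out->objectid = start;
	out->offset = size;

	/* cases 2 and 4: left neighbor ends exactly where we start */
	if (left && left->objectid + left->offset == start) {
		out->objectid = left->objectid;
		out->offset += left->offset;
		delta--;
	}
	/* cases 3 and 4: right neighbor starts exactly where we end */
	if (right && right->objectid == start + size) {
		out->offset += right->offset;
		delta--;
	}
	return delta;
}

int main(void)
{
	struct key left = { 0, 10 }, right = { 30, 5 }, merged;
	/* Case 4: new free range [10, 30) lands between free [0, 10) and [30, 35). */
	int delta = add_free_extent(10, 20, &left, &right, &merged);

	printf("merged: start %llu len %llu, extent count change %+d\n",
	       merged.objectid, merged.offset, delta);	/* 0, 35, -1 */
	return 0;
}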
*/ + ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); + if (ret) + goto out; + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, block_group, path, + new_extents); + +out: + return ret; +} + +EXPORT_FOR_TESTS +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, block_group, path, + start, size, 0); + } else { + return add_free_space_extent(trans, block_group, path, start, + size); + } +} + +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) +{ + struct btrfs_block_group *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(trans->fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __add_to_free_space_tree(trans, block_group, path, start, size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; +} + +/* + * Populate the free space tree by walking the extent tree. Operations on the + * extent tree that happen as a result of writes to the free space tree will go + * through the normal add/remove hooks. + */ +static int populate_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) +{ + struct btrfs_root *extent_root; + struct btrfs_path *path, *path2; + struct btrfs_key key; + u64 start, end; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + path2 = btrfs_alloc_path(); + if (!path2) { + btrfs_free_path(path); + return -ENOMEM; + } + + ret = add_new_free_space_info(trans, block_group, path2); + if (ret) + goto out; + + mutex_lock(&block_group->free_space_lock); + + /* + * Iterate through all of the extent and metadata items in this block + * group, adding the free space between them and the free space at the + * end. Note that EXTENT_ITEM and METADATA_ITEM are less than + * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's + * contained in. 
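The walk that follows implements exactly that description: free space is whatever lies between the allocated extents of the block group, plus the tail after the last one. A simplified standalone sketch of the idea (it ignores the metadata-item sizing and the BLOCK_GROUP_ITEM skipping handled by the real loop, and all values are hypothetical):

/* Sketch: free space = gaps between sorted allocated extents + the tail. */
#include <stdio.h>

struct extent { unsigned long long start, len; };

static void emit_free(unsigned long long start, unsigned long long len)
{
	printf("free: %llu +%llu\n", start, len);
}

static void populate(unsigned long long bg_start, unsigned long long bg_len,
		     const struct extent *allocated, int nr)
{
	unsigned long long cursor = bg_start;
	unsigned long long bg_end = bg_start + bg_len;

	for (int i = 0; i < nr; i++) {
		if (allocated[i].start >= bg_end)
			break;
		if (cursor < allocated[i].start)
			emit_free(cursor, allocated[i].start - cursor);
		cursor = allocated[i].start + allocated[i].len;
	}
	if (cursor < bg_end)
		emit_free(cursor, bg_end - cursor);
}

int main(void)
{
	/* Hypothetical block group [1000, 2000) with two allocated extents. */
	const struct extent allocated[] = { { 1100, 100 }, { 1500, 200 } };

	populate(1000, 1000, allocated, 2);
	/* prints: free: 1000 +100, free: 1200 +300, free: 1700 +300 */
	return 0;
}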
+ */ + key.objectid = block_group->start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + extent_root = btrfs_extent_root(trans->fs_info, key.objectid); + ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); + if (ret < 0) + goto out_locked; + ASSERT(ret == 0); + + start = block_group->start; + end = block_group->start + block_group->length; + while (1) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + if (key.objectid >= end) + break; + + if (start < key.objectid) { + ret = __add_to_free_space_tree(trans, + block_group, + path2, start, + key.objectid - + start); + if (ret) + goto out_locked; + } + start = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + start += trans->fs_info->nodesize; + else + start += key.offset; + } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + if (key.objectid != block_group->start) + break; + } + + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + goto out_locked; + if (ret) + break; + } + if (start < end) { + ret = __add_to_free_space_tree(trans, block_group, path2, + start, end - start); + if (ret) + goto out_locked; + } + + ret = 0; +out_locked: + mutex_unlock(&block_group->free_space_lock); +out: + btrfs_free_path(path2); + btrfs_free_path(path); + return ret; +} + +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root; + struct btrfs_block_group *block_group; + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + free_space_root = btrfs_create_tree(trans, + BTRFS_FREE_SPACE_TREE_OBJECTID); + if (IS_ERR(free_space_root)) { + ret = PTR_ERR(free_space_root); + goto abort; + } + ret = btrfs_global_root_insert(free_space_root); + if (ret) { + btrfs_put_root(free_space_root); + goto abort; + } + + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group, + cache_node); + ret = populate_free_space_tree(trans, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); + clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + ret = btrfs_commit_transaction(trans); + + /* + * Now that we've committed the transaction any reading of our commit + * root will be safe, so we can cache from the free space tree now. 
+ */ + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; + +abort: + clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; +} + +static int clear_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int nr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + nr = btrfs_header_nritems(path->nodes[0]); + if (!nr) + break; + + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + ret = btrfs_del_root(trans, &free_space_root->root_key); + if (ret) + goto abort; + + btrfs_global_root_delete(free_space_root); + + spin_lock(&fs_info->trans_lock); + list_del(&free_space_root->dirty_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_tree_lock(free_space_root->node); + btrfs_clear_buffer_dirty(trans, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 1); + + btrfs_put_root(free_space_root); + + return btrfs_commit_transaction(trans); + +abort: + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; +} + +int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *block_group; + + block_group = rb_entry(node, struct btrfs_block_group, + cache_node); + ret = populate_free_space_tree(trans, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); + clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + + ret = btrfs_commit_transaction(trans); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; 
+abort: + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; +} + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) +{ + int ret; + + clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); + + ret = add_new_free_space_info(trans, block_group, path); + if (ret) + return ret; + + return __add_to_free_space_tree(trans, block_group, path, + block_group->start, + block_group->length); +} + +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *path = NULL; + int ret = 0; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + mutex_lock(&block_group->free_space_lock); + if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = __add_block_group_free_space(trans, block_group, path); + +out: + btrfs_free_path(path); + mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; +} + +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) +{ + struct btrfs_root *root = btrfs_free_space_root(block_group); + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + u64 start, end; + int done = 0, nr; + int ret; + + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) + return 0; + + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { + /* We never added this block group to the free space tree. */ + return 0; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + start = block_group->start; + end = block_group->start + block_group->length; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); + done = 1; + nr++; + path->slots[0]--; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY || + found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; +} + +static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int prev_bit = 0, bit; + /* Initialize to silence GCC. 
*/ + u64 extent_start = 0; + u64 end, offset; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = btrfs_free_space_root(block_group); + + end = block_group->start + block_group->length; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(block_group, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + u64 space_added; + + ret = btrfs_add_new_free_space(block_group, + extent_start, + offset, + &space_added); + if (ret) + goto out; + total_found += space_added; + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + prev_bit = bit; + offset += fs_info->sectorsize; + } + } + if (prev_bit == 1) { + ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); + if (ret) + goto out; + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, + "incorrect extent count for %llu; counted %u, expected %u", + block_group->start, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + return ret; +} + +static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + u64 end; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = btrfs_free_space_root(block_group); + + end = block_group->start + block_group->length; + + while (1) { + u64 space_added; + + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + ret = btrfs_add_new_free_space(block_group, key.objectid, + key.objectid + key.offset, + &space_added); + if (ret) + goto out; + total_found += space_added; + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, + "incorrect extent count for %llu; counted %u, expected %u", + block_group->start, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + return ret; +} + +int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group *block_group; + struct btrfs_free_space_info *info; + struct btrfs_path *path; + u32 extent_count, flags; + int ret; + + block_group = caching_ctl->block_group; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Just like caching_thread() doesn't want to deadlock on the extent + * tree, we don't want to deadlock on the free space tree. 
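The bitmap loader above is a run-length decode over the per-block bits: a 0 -> 1 transition starts a free extent, a 1 -> 0 transition ends it, and a run still open at the end is closed against the block group's end. A standalone sketch of that decoding (userspace only, with a plain int array standing in for the on-disk bitmap):

/* Sketch of the run decoding in load_free_space_bitmaps(). */
#include <stdio.h>

static unsigned int decode_bitmap(const int *bits, int nr_blocks,
				  unsigned long long bg_start,
				  unsigned long long sectorsize)
{
	unsigned int extent_count = 0;
	unsigned long long extent_start = 0;
	int prev_bit = 0;

	for (int i = 0; i < nr_blocks; i++) {
		unsigned long long offset = bg_start + i * sectorsize;

		if (prev_bit == 0 && bits[i] == 1) {
			extent_start = offset;		/* free run begins */
		} else if (prev_bit == 1 && bits[i] == 0) {
			printf("free: %llu..%llu\n", extent_start, offset);
			extent_count++;			/* free run ends */
		}
		prev_bit = bits[i];
	}
	if (prev_bit == 1) {				/* run reaches the end */
		printf("free: %llu..%llu\n", extent_start,
		       bg_start + (unsigned long long)nr_blocks * sectorsize);
		extent_count++;
	}
	return extent_count;
}

int main(void)
{
	const int bits[] = { 0, 1, 1, 0, 1, 1, 1 };	/* two free runs */

	printf("extents: %u\n", decode_bitmap(bits, 7, 0, 4096));
	return 0;
}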
+ */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + info = search_free_space_info(NULL, block_group, path, 0); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + flags = btrfs_free_space_flags(path->nodes[0], info); + + /* + * We left path pointing to the free space info item, so now + * load_free_space_foo can just iterate through the free space tree from + * there. + */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) + ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + else + ret = load_free_space_extents(caching_ctl, path, extent_count); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h new file mode 100644 index 0000000000..6d5551d0ce --- /dev/null +++ b/fs/btrfs/free-space-tree.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + */ + +#ifndef BTRFS_FREE_SPACE_TREE_H +#define BTRFS_FREE_SPACE_TREE_H + +struct btrfs_caching_control; + +/* + * The default size for new free space bitmap items. The last bitmap in a block + * group may be truncated, and none of the free space tree code assumes that + * existing bitmaps are this size. + */ +#define BTRFS_FREE_SPACE_BITMAP_SIZE 256 +#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) + +void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); +int load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, int cow); +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset); +#endif + +#endif diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c new file mode 100644 index 0000000000..31c1648bc0 --- /dev/null +++ b/fs/btrfs/fs.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "messages.h" +#include "ctree.h" +#include "fs.h" +#include "accessors.h" + +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + 
disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } +} + +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } +} + +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } +} + +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } +} diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h new file mode 100644 index 0000000000..a523d64d54 --- /dev/null +++ b/fs/btrfs/fs.h @@ -0,0 +1,999 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FS_H +#define BTRFS_FS_H + +#include +#include +#include +#include +#include "extent-io-tree.h" +#include "extent_map.h" +#include "async-thread.h" +#include "block-rsv.h" + +#define BTRFS_MAX_EXTENT_SIZE SZ_128M + +#define BTRFS_OLDEST_GENERATION 0ULL + +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M + +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + +/* + * Number of metadata items necessary for an unlink operation: + * + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode + * 1 for the parent inode + */ +#define BTRFS_UNLINK_METADATA_UNITS 6 + +/* + * The reserved space at the beginning of each device. 
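The four helpers in fs.c above all follow the same double-checked pattern: peek at the superblock flags without any lock, and only when the bit looks wrong take super_lock, re-check, and modify, so the common already-set case stays lock-free. A userspace analogue of just that control flow (no memory-ordering annotations, and the flag value is made up):

/* Sketch of the check / lock / re-check pattern used by the feature helpers. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t super_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long super_flags;

static void set_feature_flag(unsigned long long flag, const char *name)
{
	if (super_flags & flag)			/* fast path: already set */
		return;

	pthread_mutex_lock(&super_lock);
	if (!(super_flags & flag)) {		/* re-check under the lock */
		super_flags |= flag;
		printf("setting feature flag for %s (0x%llx)\n", name, flag);
	}
	pthread_mutex_unlock(&super_lock);
}

int main(void)
{
	set_feature_flag(1ULL << 3, "example");	/* prints once */
	set_feature_flag(1ULL << 3, "example");	/* fast path, silent */
	return 0;
}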
It covers the primary + * super block and leaves space for potential use by other tools like + * bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) +/* + * Runtime (in-memory) states of filesystem + */ +enum { + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT +}; + +enum { + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, + + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + */ + BTRFS_FS_RELOC_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + + /* Indicate that we want the transaction kthread to commit right now. */ + BTRFS_FS_COMMIT_TRANS, + + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + + /* This is set when active zone tracking is needed. */ + BTRFS_FS_ACTIVE_ZONE_TRACKING, + + /* + * Indicate if we have some features changed, this is mostly for + * cleaner thread to update the sysfs interface. + */ + BTRFS_FS_FEATURE_CHANGED, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif +}; + +/* + * Flags for mount options. 
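Note that the two enums above are bit numbers, used with set_bit()/test_bit()/clear_bit() on fs_info->fs_state and fs_info->flags, whereas the BTRFS_MOUNT_* values defined next are already-shifted masks that are simply or'ed into mount_opt. A small sketch of that difference, with userspace stand-ins for the bit helpers and hypothetical flag names:

/* Sketch: bit indices (set_bit/test_bit style) vs. pre-shifted masks. */
#include <stdio.h>

#define MY_STATE_RO	1		/* bit index, in the style of BTRFS_FS_STATE_RO */
#define MY_MOUNT_SSD	(1UL << 3)	/* mask, in the style of BTRFS_MOUNT_SSD */

static void set_bit_idx(int nr, unsigned long *addr)  { *addr |= 1UL << nr; }
static int  test_bit_idx(int nr, unsigned long *addr) { return !!(*addr & (1UL << nr)); }

int main(void)
{
	unsigned long fs_state = 0, mount_opt = 0;

	set_bit_idx(MY_STATE_RO, &fs_state);	/* index gets shifted internally */
	mount_opt |= MY_MOUNT_SSD;		/* mask gets or'ed directly */

	printf("ro=%d ssd=%d\n", test_bit_idx(MY_STATE_RO, &fs_state),
	       !!(mount_opt & MY_MOUNT_SSD));
	return 0;
}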
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), +}; + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) + +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) + +#ifdef CONFIG_BTRFS_DEBUG + /* + * Features under developmen like Extent tree v2 support is enabled + * only under CONFIG_BTRFS_DEBUG. 
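These *_SUPP masks are what the on-disk feature bits get checked against at mount time: unknown incompat bits are expected to fail the mount, unknown compat_ro bits to allow only a read-only mount, and unknown plain compat bits to be ignored. The real checks live in the superblock validation code elsewhere; the sketch below only captures that decision logic, with hypothetical bit values:

/* Sketch of the compat / compat_ro / incompat mount decision. */
#include <stdbool.h>
#include <stdio.h>

enum mount_result { MOUNT_RW, MOUNT_RO_ONLY, MOUNT_FAIL };

static enum mount_result check_features(unsigned long long compat,
					unsigned long long compat_ro,
					unsigned long long incompat,
					unsigned long long compat_ro_supp,
					unsigned long long incompat_supp,
					bool want_rw)
{
	(void)compat;			/* unknown compat bits are harmless */

	if (incompat & ~incompat_supp)
		return MOUNT_FAIL;	/* cannot even read this fs safely */
	if (want_rw && (compat_ro & ~compat_ro_supp))
		return MOUNT_RO_ONLY;	/* readable, but must not write */
	return want_rw ? MOUNT_RW : MOUNT_RO_ONLY;
}

int main(void)
{
	/* Hypothetical: the fs carries one compat_ro bit we do not support. */
	enum mount_result r = check_features(0, 0x8, 0, 0x7, ~0ULL, true);

	printf("%s\n", r == MOUNT_FAIL ? "fail" :
	       r == MOUNT_RO_ONLY ? "read-only only" : "read-write");
	return 0;
}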
+ */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + +#else + +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE) + +#endif + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (2048) + +struct btrfs_dev_replace { + /* See #define above */ + u64 replace_state; + /* Seconds since 1-Jan-1970 */ + time64_t time_started; + /* Seconds since 1-Jan-1970 */ + time64_t time_stopped; + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + /* See #define above */ + u64 cont_reading_from_srcdev_mode; + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + struct mutex lock_finishing_cancel_unmount; + struct rw_semaphore rwsem; + + struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; +}; + +/* + * Free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* Largest extent in this cluster */ + u64 max_size; + + /* First extent starting offset */ + u64 window_start; + + /* We did a full search and couldn't create a cluster */ + bool fragmented; + + struct btrfs_block_group *block_group; + /* + * When a cluster is allocated from a block group, we put the cluster + * onto a list in the block group so that it can be freed before the + * block group is freed. + */ + struct list_head block_group_list; +}; + +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + u64 prev_discard_time; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + u64 delay_ms; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + +/* Store data about transaction commits, exported via sysfs. 
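The enum above encodes the rule that only one of these exclusive operations may run at a time; fs.h later declares btrfs_exclop_start()/btrfs_exclop_finish() as the entry points and documents the field as protected by super_lock. The sketch below is only a userspace illustration of the same single-owner rule, using a C11 compare-and-exchange instead of the kernel's locking:

/* Sketch: one exclusive operation at a time, claimed from EXCLOP_NONE. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum exclop { EXCLOP_NONE, EXCLOP_BALANCE, EXCLOP_DEV_ADD, EXCLOP_RESIZE };

static _Atomic int exclusive_operation = EXCLOP_NONE;

static bool exclop_start(enum exclop type)
{
	int expected = EXCLOP_NONE;

	/* Succeeds only if nothing else is currently running. */
	return atomic_compare_exchange_strong(&exclusive_operation, &expected,
					      (int)type);
}

static void exclop_finish(void)
{
	atomic_store(&exclusive_operation, EXCLOP_NONE);
}

int main(void)
{
	printf("balance: %d\n", exclop_start(EXCLOP_BALANCE));	/* 1 */
	printf("resize:  %d\n", exclop_start(EXCLOP_RESIZE));	/* 0, busy */
	exclop_finish();
	printf("resize:  %d\n", exclop_start(EXCLOP_RESIZE));	/* 1 */
	return 0;
}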
*/ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + +struct btrfs_fs_info { + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + unsigned long flags; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; + + /* The log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* Block group cache stuff */ + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; + + /* Keep track of unallocated space */ + atomic64_t free_chunk_space; + + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; + + /* logical->physical extent mapping */ + struct extent_map_tree mapping_tree; + + /* + * Block reservation for extent, checksum, root tree and delayed dir + * index item. + */ + struct btrfs_block_rsv global_block_rsv; + /* Block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* Block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + /* Block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; + + /* + * This is updated to the current trans every time a full commit is + * required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + + unsigned long compress_type:4; + unsigned int compress_level; + u32 commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. 
+ */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct super_block *sb; + struct inode *btree_inode; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + + /* + * This is taken to make sure we don't set block groups ro after the + * free space cache has been allocated on them. + */ + struct mutex ro_block_group_mutex; + + /* + * This is used during read/modify/write to make sure no two ios are + * trying to mod the same stripe at the same time. + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * This protects the ordered operations list only while we are + * processing all of the entries on it. This way we make sure the + * commit code doesn't find the list temporarily empty because another + * function happens to be doing non-waiting preflush before jumping + * into the main commit. + */ + struct mutex ordered_operations_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + + spinlock_t trans_lock; + /* + * The reloc mutex goes with the trans lock, it is taken during commit + * to protect us from the relocation code. + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + + atomic64_t tree_mod_seq; + + /* This protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + + /* This is used to protect the following list -- ordered_roots. */ + spinlock_t ordered_root_lock; + + /* + * All fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * These can span multiple transactions and basically include every + * dirty data page that isn't from nodatacow. + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* All fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * There is a pool of worker threads for checksumming during writes and + * a pool for checksumming after reads. This is because readers can + * run with FS locks held, and the writers may be waiting for those + * locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other two. + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *caching_workers; + + /* + * Fixup workers take dirty pages that didn't properly go through the + * cow mechanism and make them safe to write. It happens for the + * sys_munmap function call path. 
+ */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + u32 thread_pool_size; + + struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; + struct kobject *discard_kobj; + + /* Used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + struct percpu_counter ordered_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + /* Protected by 'trans_lock'. */ + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * The space_info list is effectively read only after initial setup. + * It is populated at mount time and cleaned up after all block groups + * are removed. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd_spread mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* All metadata allocations go through this cluster. */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* Auto defrag inodes go here. */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * These three are in extended format (availability of single chunks is + * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted + * by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* Balance state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + + u32 data_chunk_allocations; + u32 metadata_ratio; + + void *bdev_holder; + + /* Private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* Is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* Holds configuration and tracking. Protected by qgroup_lock. */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* + * Used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ + struct mutex qgroup_ioctl_lock; + + /* List of dirty qgroups to be written at next commit. */ + struct list_head dirty_qgroups; + + /* Used by qgroup for an efficient tree traversal. */ + u64 qgroup_seq; + + /* Qgroup rescan items. 
*/ + /* Protects the progress item */ + struct mutex qgroup_rescan_lock; + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* Protected by qgroup_rescan_lock */ + bool qgroup_rescan_running; + u8 qgroup_drop_subtree_thres; + + /* + * If this is not 0, then it indicates a serious filesystem error has + * happened and it contains that error (negative errno value). + */ + int fs_error; + + /* Filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ + struct radix_tree_root buffer_radix; + + /* Next backup root to be overwritten */ + int backup_root_index; + + /* Device replace state */ + struct btrfs_dev_replace dev_replace; + + struct semaphore uuid_tree_rescan_sem; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; + + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; + u32 stripesize; + + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + + struct crypto_shash *csum_shash; + + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; + + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + u64 zone_size; + + /* Constraints for ZONE_APPEND commands: */ + struct queue_limits limits; + u64 max_zone_append_size; + + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; + + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + struct btrfs_block_group *active_meta_bg; + struct btrfs_block_group *active_system_bg; + + u64 nr_global_roots; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). 
+ */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; +#endif +}; + +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + +/* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + +/* Compatibility and incompatibility defines */ +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); + +#define __btrfs_fs_incompat(fs_info, flags) \ + (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) + +#define __btrfs_fs_compat_ro(fs_info, flags) \ + (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) + +#define 
btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ +} while (0) + +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ +} while (0) + +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* Do it this way so we only ever do one test_bit in the normal case. */ + if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { + if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return 2; + return 1; + } + return 0; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + +#define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error)) + +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define EXPORT_FOR_TESTS + +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} + +void btrfs_test_destroy_inode(struct inode *inode); + +#else + +#define EXPORT_FOR_TESTS static + +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 0000000000..d3ff97374d --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,751 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
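The mount-option macros above (btrfs_set_opt(), btrfs_clear_opt(), btrfs_test_opt()) are plain bitmask operations on fs_info->mount_opt. A standalone sketch of the same pattern; the option bits here are made up for illustration and do not correspond to real BTRFS_MOUNT_* values.

#include <assert.h>
#include <stdio.h>

/* Hypothetical option bits, standing in for BTRFS_MOUNT_* flags. */
#define OPT_COMPRESS   (1UL << 0)
#define OPT_SSD        (1UL << 1)

#define set_opt(o, opt)   ((o) |= (opt))
#define clear_opt(o, opt) ((o) &= ~(opt))
#define test_opt(o, opt)  ((o) & (opt))

int main(void)
{
	unsigned long mount_opt = 0;

	set_opt(mount_opt, OPT_COMPRESS);
	assert(test_opt(mount_opt, OPT_COMPRESS));
	assert(!test_opt(mount_opt, OPT_SSD));

	clear_opt(mount_opt, OPT_COMPRESS);
	assert(!test_opt(mount_opt, OPT_COMPRESS));

	printf("final mount_opt = 0x%lx\n", mount_opt);
	return 0;
}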
+ */ + +#include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "inode-item.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "space-info.h" +#include "accessors.h" +#include "extent-tree.h" +#include "file-item.h" + +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, + const struct fscrypt_str *name) +{ + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name->len) + continue; + if (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0) + return ref; + } + return NULL; +} + +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct fscrypt_str *name) +{ + struct btrfs_inode_extref *extref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + + item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + + /* + * Search all extended backrefs in this item. We're only + * looking through any collisions so most of the time this is + * just going to compare against one buffer. If all is well, + * we'll return success and the inode ref object. + */ + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_ptr = (unsigned long)(&extref->name); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (ref_name_len == name->len && + btrfs_inode_extref_parent(leaf, extref) == ref_objectid && + (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0)) + return extref; + + cur_offset += ref_name_len + sizeof(*extref); + } + return NULL; +} + +/* Returns NULL if no extref found */ +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) +{ + int ret; + struct btrfs_key key; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name); + +} + +static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, + u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + int ret; + int del_len = name->len + sizeof(*extref); + unsigned long ptr; + unsigned long item_start; + u32 item_size; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + /* + * Sanity check - did we find 
the right item for this name? + * This should always succeed so error here will make the FS + * readonly. + */ + extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name); + if (!extref) { + btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); + ret = -EROFS; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + + if (del_len == item_size) { + /* + * Common case only one ref in the item, remove the + * whole item. + */ + ret = btrfs_del_item(trans, root, path); + goto out; + } + + ptr = (unsigned long)extref; + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + + memmove_extent_buffer(leaf, ptr, ptr + del_len, + item_size - (ptr + del_len - item_start)); + + btrfs_truncate_item(trans, path, item_size - del_len, 1); + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int search_ext_refs = 0; + int del_len = name->len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + key.type = BTRFS_INODE_REF_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + search_ext_refs = 1; + goto out; + } else if (ret < 0) { + goto out; + } + + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); + if (!ref) { + ret = -ENOENT; + search_ext_refs = 1; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name->len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + btrfs_truncate_item(trans, path, item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + + if (search_ext_refs) { + /* + * No refs were found, or we could not find the + * name in our ref array. Find and remove the extended + * inode ref then. + */ + return btrfs_del_inode_extref(trans, root, name, + inode_objectid, ref_objectid, index); + } + + return ret; +} + +/* + * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * + * The caller must have checked against BTRFS_LINK_MAX already. 
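Both deletion helpers above (btrfs_del_inode_extref() and btrfs_del_inode_ref()) shrink a packed item by sliding the bytes after the matched (header + name) record over it and then truncating the item. A simplified user-space model of that memmove, using a bare {len, name} record layout instead of the real btrfs_inode_ref/extref structures:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/*
 * Packed records: one byte of name length followed by the name bytes.
 * (The real on-disk refs also carry index/parent fields; omitted here.)
 */
static size_t del_record(char *item, size_t item_size, size_t off, size_t rec_len)
{
	/* Slide everything after the record onto it, then shrink the item. */
	memmove(item + off, item + off + rec_len, item_size - off - rec_len);
	return item_size - rec_len;
}

int main(void)
{
	/* Three names packed back to back: "ab", "cde", "f". */
	char item[] = { 2, 'a', 'b', 3, 'c', 'd', 'e', 1, 'f' };
	size_t size = sizeof(item);

	/* Delete the middle record ("cde"): starts at offset 3, 4 bytes long. */
	size = del_record(item, size, 3, 1 + 3);

	assert(size == 5);
	assert(memcmp(item, "\x02" "ab" "\x01" "f", size) == 0);
	printf("remaining item is %zu bytes\n", size);
	return 0;
}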
+ */ +static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, + u64 index) +{ + struct btrfs_inode_extref *extref; + int ret; + int ins_len = name->len + sizeof(*extref); + unsigned long ptr; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, + name)) + goto out; + + btrfs_extend_item(trans, path, ins_len); + ret = 0; + } + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); + ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); + btrfs_set_inode_extref_index(path->nodes[0], extref, index); + btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); + + ptr = (unsigned long)&extref->name; + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name->len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + key.type = BTRFS_INODE_REF_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name); + if (ref) + goto out; + + old_size = btrfs_item_size(path->nodes[0], path->slots[0]); + btrfs_extend_item(trans, path, ins_len); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + if (ret == -EOVERFLOW) { + if (btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], + name)) + ret = -EEXIST; + else + ret = -EMLINK; + } + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + +out: + btrfs_free_path(path); + + if (ret == -EMLINK) { + struct btrfs_super_block *disk_super = fs_info->super_copy; + /* We ran out of space in the ref array. Need to + * add an extended ref. 
*/ + if (btrfs_super_incompat_flags(disk_super) + & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + ret = btrfs_insert_inode_extref(trans, root, name, + inode_objectid, + ref_objectid, index); + } + + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + found_key.type == location->type) { + path->slots[0]--; + return 0; + } + } + return ret; +} + +static inline void btrfs_trace_truncate(struct btrfs_inode *inode, + struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 offset, int extent_type, int slot) +{ + if (!inode) + return; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot, + offset); + else + trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset); +} + +/* + * Remove inode items from a given root. + * + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @control: The btrfs_truncate_control to control how and what we + * are truncating. + * + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. + * + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. + */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 new_size = control->new_size; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u32 found_type = (u8)-1; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int ret; + u64 bytes_deleted = 0; + bool be_nice = false; + + ASSERT(control->inode || !control->clear_extent_range); + ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY); + + control->last_size = new_size; + control->sub_bytes = 0; + + /* + * For shareable roots we want to back off from time to time, this turns + * out to be subvolume roots, reloc roots, and data reloc roots. 
+ */ + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + be_nice = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_BACK; + + key.objectid = control->ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + /* + * With a 16K leaf size and 128MiB extents, you can actually queue up a + * huge file in a single leaf. Most of the time that bytes_deleted is + * > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > SZ_32M && + btrfs_should_end_transaction(trans)) { + ret = -EAGAIN; + goto out; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = 0; + /* There are no items in the tree for us to truncate, we're done */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + u64 clear_start = 0, clear_len = 0, extent_start = 0; + bool refill_delayed_refs_rsv = false; + + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = found_key.type; + + if (found_key.objectid != control->ino) + break; + + if (found_type < control->min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + item_end += btrfs_file_extent_ram_bytes(leaf, fi); + + btrfs_trace_truncate(control->inode, leaf, fi, + found_key.offset, extent_type, + path->slots[0]); + item_end--; + } + if (found_type > control->min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + control->extents_found++; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + + clear_start = found_key.offset; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - extent_num_bytes); + if (extent_start != 0) + control->sub_bytes += num_dec; + btrfs_mark_buffer_dirty(trans, leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) + control->sub_bytes += num_dec; + } + clear_len = num_dec; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * We can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(trans, path, size, 1); + } else if (!del_item) { + /* + * We have to bail so the last_size is set to + * just 
before this extent. + */ + ret = BTRFS_NEED_TRUNCATE_BLOCK; + break; + } else { + /* + * Inline extents are special, we just treat + * them as a full sector worth in the file + * extent tree just for simplicity sake. + */ + clear_len = fs_info->sectorsize; + } + + control->sub_bytes += item_end + 1 - new_size; + } +delete: + /* + * We only want to clear the file extent range if we're + * modifying the actual inode's mapping, which is just the + * normal truncate path. + */ + if (control->clear_extent_range) { + ret = btrfs_inode_clear_file_extent_range(control->inode, + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + + if (del_item) { + ASSERT(!pending_del_nr || + ((path->slots[0] + 1) == pending_del_slot)); + + control->last_size = found_key.offset; + if (!pending_del_nr) { + /* No pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (path->slots[0] + 1 == pending_del_slot) { + /* Hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } + } else { + control->last_size = new_size; + break; + } + + if (del_item && extent_start != 0 && !control->skip_ref_updates) { + struct btrfs_ref ref = { 0 }; + + bytes_deleted += extent_num_bytes; + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, + extent_start, extent_num_bytes, 0); + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + control->ino, extent_offset, + root->root_key.objectid, false); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) + refill_delayed_refs_rsv = true; + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot || + refill_delayed_refs_rsv) { + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + pending_del_nr = 0; + } + btrfs_release_path(path); + + /* + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. 
+ */ + if (refill_delayed_refs_rsv) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } + } + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (ret >= 0 && pending_del_nr) { + int err; + + err = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, err); + ret = err; + } + } + + ASSERT(control->last_size >= new_size); + if (!ret && control->last_size > new_size) + control->last_size = new_size; + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h new file mode 100644 index 0000000000..ede43b6c65 --- /dev/null +++ b/fs/btrfs/inode-item.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_INODE_ITEM_H +#define BTRFS_INODE_ITEM_H + +#include + +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_path; +struct btrfs_key; +struct btrfs_inode_extref; +struct btrfs_inode; +struct extent_buffer; + +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define BTRFS_NEED_TRUNCATE_BLOCK 1 + +struct btrfs_truncate_control { + /* + * IN: the inode we're operating on, this can be NULL if + * ->clear_extent_range is false. + */ + struct btrfs_inode *inode; + + /* IN: the size we're truncating to. */ + u64 new_size; + + /* OUT: the number of extents truncated. */ + u64 extents_found; + + /* OUT: the last size we truncated this inode to. */ + u64 last_size; + + /* OUT: the number of bytes to sub from this inode. */ + u64 sub_bytes; + + /* IN: the ino we are truncating. */ + u64 ino; + + /* + * IN: minimum key type to remove. All key types with this type are + * removed only if their offset >= new_size. + */ + u32 min_type; + + /* + * IN: true if we don't want to do extent reference updates for any file + * extents we drop. + */ + bool skip_ref_updates; + + /* + * IN: true if we need to clear the file extent range for the inode as + * we drop the file extent items. + */ + bool clear_extent_range; +}; + +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. 
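btrfs_inode_combine_flags() and btrfs_inode_split_flags(), defined just below, simply pack the runtime flags and the read-only flags into the low and high 32 bits of the on-disk u64. A standalone round-trip check of that packing; the flag values are arbitrary examples.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t combine_flags(uint32_t flags, uint32_t ro_flags)
{
	return flags | ((uint64_t)ro_flags << 32);
}

static void split_flags(uint64_t item_flags, uint32_t *flags, uint32_t *ro_flags)
{
	*flags = (uint32_t)item_flags;
	*ro_flags = (uint32_t)(item_flags >> 32);
}

int main(void)
{
	uint32_t flags = 0x11, ro_flags = 0x1, out_flags, out_ro;

	uint64_t packed = combine_flags(flags, ro_flags);
	split_flags(packed, &out_flags, &out_ro);

	assert(out_flags == flags && out_ro == ro_flags);
	printf("packed = 0x%016llx\n", (unsigned long long)packed);
	return 0;
}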
+ */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control); +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +struct btrfs_inode_extref *btrfs_lookup_inode_extref( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, + const struct fscrypt_str *name); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct fscrypt_str *name); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 0000000000..f250e2083c --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,10964 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "bio.h" +#include "compression.h" +#include "locking.h" +#include "free-space-cache.h" +#include "props.h" +#include "qgroup.h" +#include "delalloc-space.h" +#include "block-group.h" +#include "space-info.h" +#include "zoned.h" +#include "subpage.h" +#include "inode-item.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "file-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "file.h" +#include "acl.h" +#include "relocation.h" +#include "verity.h" +#include "super.h" +#include "orphan.h" +#include "backref.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +struct btrfs_dio_data { + ssize_t submitted; + struct extent_changeset *data_reserved; + struct btrfs_ordered_extent *ordered; + bool data_space_reserved; + bool nocow_done; +}; + +struct btrfs_dio_private { + /* Range of I/O */ + u64 file_offset; + u32 bytes; + + /* This must be last */ + struct btrfs_bio bbio; +}; + +static struct bio_set btrfs_dio_bioset; + +struct btrfs_rename_ctx { + /* Output field. Stores the index number of the old directory entry. 
*/ + u64 index; +}; + +/* + * Used by data_reloc_print_warning_inode() to pass needed info for filename + * resolution and output of error message. + */ +struct data_reloc_warn { + struct btrfs_path path; + struct btrfs_fs_info *fs_info; + u64 extent_item_size; + u64 logical; + int mirror_num; +}; + +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct file_operations btrfs_dir_file_operations; + +static struct kmem_cache *btrfs_inode_cachep; + +static int btrfs_setsize(struct inode *inode, struct iattr *attr); +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); + +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty); +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, + u64 block_len, u64 orig_block_len, + u64 ram_bytes, int compress_type, + int type); + +static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) +{ + struct data_reloc_warn *warn = warn_ctx; + struct btrfs_fs_info *fs_info = warn->fs_info; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key key; + unsigned int nofs_flag; + u32 nlink; + int ret; + + local_root = btrfs_get_fs_root(fs_info, root, true); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + /* This makes the path point to (inum INODE_ITEM ioff). */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); + if (ret) { + btrfs_put_root(local_root); + btrfs_release_path(&warn->path); + goto err; + } + + eb = warn->path.nodes[0]; + inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(&warn->path); + + nofs_flag = memalloc_nofs_save(); + ipath = init_ipath(4096, local_root, &warn->path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(ipath)) { + btrfs_put_root(local_root); + ret = PTR_ERR(ipath); + ipath = NULL; + /* + * -ENOMEM, not a critical error, just output an generic error + * without filename. 
+ */ + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", + warn->logical, warn->mirror_num, root, inum, offset); + return ret; + } + ret = paths_from_inode(inum, ipath); + if (ret < 0) + goto err; + + /* + * We deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (int i = 0; i < ipath->fspath->elem_cnt; i++) { + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", + warn->logical, warn->mirror_num, root, inum, offset, + fs_info->sectorsize, nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + } + + btrfs_put_root(local_root); + free_ipath(ipath); + return 0; + +err: + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", + warn->logical, warn->mirror_num, root, inum, offset, ret); + + free_ipath(ipath); + return ret; +} + +/* + * Do extra user-friendly error output (e.g. lookup all the affected files). + * + * Return true if we succeeded doing the backref lookup. + * Return false if such lookup failed, and has to fallback to the old error message. + */ +static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, + const u8 *csum, const u8 *csum_expected, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_path path = { 0 }; + struct btrfs_key found_key = { 0 }; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + const u32 csum_size = fs_info->csum_size; + u64 logical; + u64 flags; + u32 item_size; + int ret; + + mutex_lock(&fs_info->reloc_mutex); + logical = btrfs_get_reloc_bg_bytenr(fs_info); + mutex_unlock(&fs_info->reloc_mutex); + + if (logical == U64_MAX) { + btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, btrfs_ino(inode), file_off, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + return; + } + + logical += file_off; + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, + btrfs_ino(inode), file_off, logical, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + + ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); + if (ret < 0) { + btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", + logical, ret); + return; + } + eb = path.nodes[0]; + ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size(eb, path.slots[0]); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + unsigned long ptr = 0; + u64 ref_root; + u8 ref_level; + + while (true) { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + if (ret < 0) { + btrfs_warn_rl(fs_info, + "failed to resolve tree backref for logical %llu: %d", + logical, ret); + break; + } + if (ret > 0) + break; + + btrfs_warn_rl(fs_info, +"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", + logical, mirror_num, + (ref_level ? 
"node" : "leaf"), + ref_level, ref_root); + } + btrfs_release_path(&path); + } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct data_reloc_warn reloc_warn = { 0 }; + + btrfs_release_path(&path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = logical - found_key.objectid; + ctx.fs_info = fs_info; + + reloc_warn.logical = logical; + reloc_warn.extent_item_size = found_key.offset; + reloc_warn.mirror_num = mirror_num; + reloc_warn.fs_info = fs_info; + + iterate_extent_inodes(&ctx, true, + data_reloc_print_warning_inode, &reloc_warn); + } +} + +static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + const u32 csum_size = root->fs_info->csum_size; + + /* For data reloc tree, it's better to do a backref lookup instead. */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + return print_data_reloc_error(inode, logical_start, csum, + csum_expected, mirror_num); + + /* Output without objectid, which is more meaningful */ + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + btrfs_warn_rl(root->fs_info, +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } else { + btrfs_warn_rl(root->fs_info, +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } +} + +/* + * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * + * ilock_flags can have the following bit set: + * + * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode + * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt + * return -EAGAIN + * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock + */ +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) +{ + if (ilock_flags & BTRFS_ILOCK_SHARED) { + if (ilock_flags & BTRFS_ILOCK_TRY) { + if (!inode_trylock_shared(&inode->vfs_inode)) + return -EAGAIN; + else + return 0; + } + inode_lock_shared(&inode->vfs_inode); + } else { + if (ilock_flags & BTRFS_ILOCK_TRY) { + if (!inode_trylock(&inode->vfs_inode)) + return -EAGAIN; + else + return 0; + } + inode_lock(&inode->vfs_inode); + } + if (ilock_flags & BTRFS_ILOCK_MMAP) + down_write(&inode->i_mmap_lock); + return 0; +} + +/* + * btrfs_inode_unlock - unock inode i_rwsem + * + * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() + * to decide whether the lock acquired is shared or exclusive. + */ +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) +{ + if (ilock_flags & BTRFS_ILOCK_MMAP) + up_write(&inode->i_mmap_lock); + if (ilock_flags & BTRFS_ILOCK_SHARED) + inode_unlock_shared(&inode->vfs_inode); + else + inode_unlock(&inode->vfs_inode); +} + +/* + * Cleanup all submitted ordered extents in specified range to handle errors + * from the btrfs_run_delalloc_range() callback. 
+ * + * NOTE: caller must ensure that when an error happens, it can not call + * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING + * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata + * to be released, which we want to happen only when finishing the ordered + * extent (btrfs_finish_ordered_io()). + */ +static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, + struct page *locked_page, + u64 offset, u64 bytes) +{ + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = 0, page_end = 0; + struct page *page; + + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + + while (index <= end_index) { + /* + * For locked page, we will call btrfs_mark_ordered_io_finished + * through btrfs_mark_ordered_io_finished() on it + * in run_delalloc_range() for the error handling, which will + * clear page Ordered and run the ordered extent accounting. + * + * Here we can't just clear the Ordered bit, or + * btrfs_mark_ordered_io_finished() would skip the accounting + * for the page range, and the ordered extent will never finish. + */ + if (locked_page && index == (page_start >> PAGE_SHIFT)) { + index++; + continue; + } + page = find_get_page(inode->vfs_inode.i_mapping, index); + index++; + if (!page) + continue; + + /* + * Here we just clear all Ordered bits for every page in the + * range, then btrfs_mark_ordered_io_finished() will handle + * the ordered extent accounting for the range. + */ + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, + offset, bytes); + put_page(page); + } + + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } + } + + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); +} + +static int btrfs_dirty_inode(struct btrfs_inode *inode); + +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args) +{ + int err; + + if (args->default_acl) { + err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ACL_TYPE_DEFAULT); + if (err) + return err; + } + if (args->acl) { + err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (err) + return err; + } + if (!args->default_acl && !args->acl) + cache_no_acl(args->inode); + return btrfs_xattr_security_init(trans, args->inode, args->dir, + &args->dentry->d_name); +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. 
The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_inode *inode, bool extent_inserted, + size_t size, size_t compressed_size, + int compress_type, + struct page **compressed_pages, + bool update_i_size) +{ + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int ret; + size_t cur_size = size; + u64 i_size; + + ASSERT((compressed_size > 0 && compressed_pages) || + (compressed_size == 0 && !compressed_pages)); + + if (compressed_size && compressed_pages) + cur_size = compressed_size; + + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; + + key.objectid = btrfs_ino(inode); + key.offset = 0; + key.type = BTRFS_EXTENT_DATA_KEY; + + datasize = btrfs_file_extent_calc_inline_size(cur_size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (compress_type != BTRFS_COMPRESS_NONE) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_SIZE); + + kaddr = kmap_local_page(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_local(kaddr); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + compress_type); + } else { + page = find_get_page(inode->vfs_inode.i_mapping, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_local_page(page); + write_extent_buffer(leaf, kaddr, ptr, size); + kunmap_local(kaddr); + put_page(page); + } + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + /* + * We align size to sectorsize for inline extents just for simplicity + * sake. + */ + ret = btrfs_inode_set_file_extent_range(inode, 0, + ALIGN(size, root->fs_info->sectorsize)); + if (ret) + goto fail; + + /* + * We're an inline extent, so nobody can extend the file past i_size + * without locking a page we already have locked. + * + * We must do any i_size and inode updates before we unlock the pages. + * Otherwise we could end up racing with unlink. + */ + i_size = i_size_read(&inode->vfs_inode); + if (update_i_size && size > i_size) { + i_size_write(&inode->vfs_inode, size); + i_size = size; + } + inode->disk_i_size = i_size; + +fail: + return ret; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. 
+ */ +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, + size_t compressed_size, + int compress_type, + struct page **compressed_pages, + bool update_i_size) +{ + struct btrfs_drop_extents_args drop_args = { 0 }; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + u64 data_len = (compressed_size ?: size); + int ret; + struct btrfs_path *path; + + /* + * We can create an inline extent if it ends at or beyond the current + * i_size, is no larger than a sector (decompressed), and the (possibly + * compressed) data fits in a leaf and the configured maximum inline + * size. + */ + if (size < i_size_read(&inode->vfs_inode) || + size > fs_info->sectorsize || + data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + data_len > fs_info->max_inline) + return 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + trans->block_rsv = &inode->block_rsv; + + drop_args.path = path; + drop_args.start = 0; + drop_args.end = fs_info->sectorsize; + drop_args.drop_cache = true; + drop_args.replace_extent = true; + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, + size, compressed_size, compress_type, + compressed_pages, update_i_size); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + + btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + + btrfs_set_inode_full_sync(inode); +out: + /* + * Don't forget to free the reserved space, as for inlined extent + * it won't count as data extent, free them directly here. + * And at reserve time, it's always aligned to page size, so + * just free one page here. 
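The early-return test at the top of cow_file_range_inline() above boils down to four conditions: the extent must end at or beyond current i_size, be at most one sector uncompressed, and the (possibly compressed) payload must fit both the per-leaf inline limit and the max_inline mount setting. A condensed user-space restatement; the limits are passed in as parameters since they vary per filesystem, and the numbers in main() are only examples (2048 assumed as the max_inline default).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirror of the eligibility test in cow_file_range_inline(); callers pass
 * data_len == compressed size when compressed, otherwise the plain size.
 */
static bool can_inline(uint64_t size, uint64_t data_len, uint64_t i_size,
		       uint32_t sectorsize, uint64_t leaf_inline_limit,
		       uint64_t max_inline)
{
	if (size < i_size)                /* must end at or beyond i_size */
		return false;
	if (size > sectorsize)            /* uncompressed data is at most one sector */
		return false;
	if (data_len > leaf_inline_limit) /* payload must fit in a leaf */
		return false;
	if (data_len > max_inline)        /* and respect the max_inline option */
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", can_inline(3000, 3000, 3000, 4096, 16000, 2048)); /* 0: exceeds max_inline */
	printf("%d\n", can_inline(1500, 700, 1500, 4096, 16000, 2048));  /* 1: compressed data fits */
	return 0;
}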
+ */ + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_free_path(path); + btrfs_end_transaction(trans); + return ret; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + int compress_type; + struct list_head list; +}; + +struct async_chunk { + struct btrfs_inode *inode; + struct page *locked_page; + u64 start; + u64 end; + blk_opf_t write_flags; + struct list_head extents; + struct cgroup_subsys_state *blkcg_css; + struct btrfs_work work; + struct async_cow *async_cow; +}; + +struct async_cow { + atomic_t num_chunks; + struct async_chunk chunks[]; +}; + +static noinline int add_async_extent(struct async_chunk *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages, + int compress_type) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); /* -ENOMEM */ + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * Check if the inode needs to be submitted to compression, based on mount + * options, defragmentation, properties or heuristics. + */ +static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, + u64 end) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (!btrfs_inode_can_compress(inode)) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: unexpected compression for ino %llu\n", + btrfs_ino(inode)); + return 0; + } + /* + * Special check for subpage. + * + * We lock the full page then run each delalloc range in the page, thus + * for the following case, we will hit some subpage specific corner case: + * + * 0 32K 64K + * | |///////| |///////| + * \- A \- B + * + * In above case, both range A and range B will try to unlock the full + * page [0, 64K), causing the one finished later will have page + * unlocked already, triggering various page lock requirement BUG_ON()s. + * + * So here we add an artificial limit that subpage compression can only + * if the range is fully page aligned. + * + * In theory we only need to ensure the first page is fully covered, but + * the tailing partial page will be locked until the full compression + * finishes, delaying the write of other range. + * + * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range + * first to prevent any submitted async extent to unlock the full page. + * By this, we can ensure for subpage case that only the last async_cow + * will unlock the full page. 
+ */ + if (fs_info->sectorsize < PAGE_SIZE) { + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) + return 0; + } + + /* force compress */ + if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) + return 1; + /* defrag ioctl */ + if (inode->defrag_compress) + return 1; + /* bad compression ratios */ + if (inode->flags & BTRFS_INODE_NOCOMPRESS) + return 0; + if (btrfs_test_opt(fs_info, COMPRESS) || + inode->flags & BTRFS_INODE_COMPRESS || + inode->prop_compress) + return btrfs_compress_heuristic(&inode->vfs_inode, start, end); + return 0; +} + +static inline void inode_should_defrag(struct btrfs_inode *inode, + u64 start, u64 end, u64 num_bytes, u32 small_write) +{ + /* If this is a small write inside eof, kick off a defrag */ + if (num_bytes < small_write && + (start > 0 || end + 1 < inode->disk_i_size)) + btrfs_add_inode_defrag(NULL, inode, small_write); +} + +/* + * Work queue call back to started compression on a file and pages. + * + * This is done inside an ordered work queue, and the compression is spread + * across many cpus. The actual IO submission is step two, and the ordered work + * queue takes care of making sure that happens in the same order things were + * put onto the queue by writepages and friends. + * + * If this code finds it can't get good compression, it puts an entry onto the + * work queue to write the uncompressed bytes. This makes sure that both + * compressed inodes and uncompressed inodes are written in the same order that + * the flusher thread sent them down. + */ +static void compress_file_range(struct btrfs_work *work) +{ + struct async_chunk *async_chunk = + container_of(work, struct async_chunk, work); + struct btrfs_inode *inode = async_chunk->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + u64 blocksize = fs_info->sectorsize; + u64 start = async_chunk->start; + u64 end = async_chunk->end; + u64 actual_end; + u64 i_size; + int ret = 0; + struct page **pages; + unsigned long nr_pages; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned int poff; + int i; + int compress_type = fs_info->compress_type; + + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); + + /* + * We need to call clear_page_dirty_for_io on each page in the range. + * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + */ + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* + * We need to save i_size before now because it could change in between + * us evaluating the size and assigning it. This is because we lock and + * unlock the page in truncate and fallocate, and then modify the i_size + * later on. + * + * The barriers are to emulate READ_ONCE, remove that once i_size_read + * does that for us. + */ + barrier(); + i_size = i_size_read(&inode->vfs_inode); + barrier(); + actual_end = min_t(u64, i_size, end + 1); +again: + pages = NULL; + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. 
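Ignoring the subpage alignment special case, the decision order in inode_need_compress() above is: forced compression wins, then a defrag request, then the per-inode NOCOMPRESS flag vetoes, and only then does the compress mount option / inode flag / property fall through to the heuristic. A compact restatement of that precedence; the heuristic is stubbed out here.

#include <stdbool.h>
#include <stdio.h>

struct compress_ctl {
	bool force_compress;   /* mount -o compress-force */
	bool defrag_compress;  /* requested by the defrag ioctl */
	bool nocompress_flag;  /* per-inode NOCOMPRESS flag */
	bool compress_opt;     /* mount option, inode flag or property */
};

/* Stand-in for btrfs_compress_heuristic(); assume it would say "try it". */
static bool heuristic_says_yes(void)
{
	return true;
}

static bool need_compress(const struct compress_ctl *c)
{
	if (c->force_compress)
		return true;
	if (c->defrag_compress)
		return true;
	if (c->nocompress_flag)
		return false;
	if (c->compress_opt)
		return heuristic_says_yes();
	return false;
}

int main(void)
{
	struct compress_ctl c = { .nocompress_flag = true, .compress_opt = true };

	/* NOCOMPRESS beats the mount option, but not compress-force. */
	printf("%d\n", need_compress(&c));  /* 0 */
	c.force_compress = true;
	printf("%d\n", need_compress(&c));  /* 1 */
	return 0;
}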
+ */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + + total_compressed = actual_end - start; + + /* + * Skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it doesn't save disk space at all. + */ + if (total_compressed <= blocksize && + (start > 0 || end + 1 < inode->disk_i_size)) + goto cleanup_and_bail_uncompressed; + + /* + * For subpage case, we require full page alignment for the sector + * aligned range. + * Thus we must also check against @actual_end, not just @end. + */ + if (blocksize < PAGE_SIZE) { + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) + goto cleanup_and_bail_uncompressed; + } + + total_compressed = min_t(unsigned long, total_compressed, + BTRFS_MAX_UNCOMPRESSED); + total_in = 0; + ret = 0; + + /* + * We do compression for mount -o compress and when the inode has not + * been flagged as NOCOMPRESS. This flag can change at any time if we + * discover bad compression ratios. + */ + if (!inode_need_compress(inode, start, end)) + goto cleanup_and_bail_uncompressed; + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { + /* + * Memory allocation failure is not a fatal error, we can fall + * back to uncompressed code. + */ + goto cleanup_and_bail_uncompressed; + } + + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; + + /* Compression level is applied here. */ + ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), + mapping, start, pages, &nr_pages, &total_in, + &total_compressed); + if (ret) + goto mark_incompressible; + + /* + * Zero the tail end of the last page, as we might be sending it down + * to disk. + */ + poff = offset_in_page(total_compressed); + if (poff) + memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + + /* + * Try to create an inline extent. + * + * If we didn't compress the entire range, try to create an uncompressed + * inline extent, else a compressed one. + * + * Check cow_file_range() for why we don't even try to create inline + * extent for the subpage case. + */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + if (total_in < actual_end) { + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, + false); + } else { + ret = cow_file_range_inline(inode, actual_end, + total_compressed, + compress_type, pages, + false); + } + if (ret <= 0) { + unsigned long clear_flags = EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; + + if (ret < 0) + mapping_set_error(mapping, -EIO); + + /* + * inline extent creation worked or returned error, + * we don't need to create any more async work items. + * Unlock and free up our temp pages. + * + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be done _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. + */ + extent_clear_unlock_delalloc(inode, start, end, + NULL, + clear_flags, + PAGE_UNLOCK | + PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + goto free_pages; + } + } + + /* + * We aren't doing an inline extent. Round the compressed size up to a + * block size boundary so the allocator does sane things. + */ + total_compressed = ALIGN(total_compressed, blocksize); + + /* + * One last check to make sure the compression is really a win, compare + * the page count read with the blocks on disk, compression must free at + * least one sector. 
+ */ + total_in = round_up(total_in, fs_info->sectorsize); + if (total_compressed + blocksize > total_in) + goto mark_incompressible; + + /* + * The async work queues will take care of doing actual allocation on + * disk for these compressed pages, and will submit the bios. + */ + add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + if (start + total_in < end) { + start += total_in; + cond_resched(); + goto again; + } + return; + +mark_incompressible: + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) + inode->flags |= BTRFS_INODE_NOCOMPRESS; +cleanup_and_bail_uncompressed: + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); +free_pages: + if (pages) { + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); + } +} + +static void free_async_extent_pages(struct async_extent *async_extent) +{ + int i; + + if (!async_extent->pages) + return; + + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + put_page(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; +} + +static void submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) +{ + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .range_start = start, + .range_end = end, + .no_cgroup_owner = 1, + }; + + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + wbc_detach_inode(&wbc); + if (ret < 0) { + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + + set_page_writeback(locked_page); + end_page_writeback(locked_page); + btrfs_mark_ordered_io_finished(inode, locked_page, + page_start, PAGE_SIZE, + !ret); + mapping_set_error(locked_page->mapping, ret); + unlock_page(locked_page); + } + } +} + +static void submit_one_async_extent(struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) +{ + struct btrfs_inode *inode = async_chunk->inode; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_extent *ordered; + struct btrfs_key ins; + struct page *locked_page = NULL; + struct extent_map *em; + int ret = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + + if (async_chunk->blkcg_css) + kthread_associate_blkcg(async_chunk->blkcg_css); + + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. 
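The profitability test above rounds both the bytes actually read and the compressed result up to full blocks and keeps the compressed copy only if it is at least one whole block smaller. Worked numbers for an assumed 4K block size:

#include <stdint.h>
#include <stdio.h>

static uint64_t round_up_u64(uint64_t x, uint64_t align)
{
	return (x + align - 1) / align * align;
}

/*
 * Same shape as the check in compress_file_range(): keep the compressed
 * copy only if it saves at least one full block on disk.
 */
static int keep_compressed(uint64_t total_in, uint64_t total_compressed,
			   uint32_t blocksize)
{
	total_in = round_up_u64(total_in, blocksize);
	total_compressed = round_up_u64(total_compressed, blocksize);
	return total_compressed + blocksize <= total_in;
}

int main(void)
{
	const uint32_t blocksize = 4096;  /* assumed sector size */

	/* 8192 bytes in, 4096 out: saves one block, keep it. */
	printf("%d\n", keep_compressed(8192, 4096, blocksize));  /* 1 */
	/* 8192 bytes in, 5000 out: rounds up to two blocks, nothing saved. */
	printf("%d\n", keep_compressed(8192, 5000, blocksize));  /* 0 */
	return 0;
}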
+ */ + if (async_chunk->locked_page) { + u64 locked_page_start = page_offset(async_chunk->locked_page); + u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + + if (!(start >= locked_page_end || end <= locked_page_start)) + locked_page = async_chunk->locked_page; + } + lock_extent(io_tree, start, end, NULL); + + if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + submit_uncompressed_range(inode, async_extent, locked_page); + goto done; + } + + ret = btrfs_reserve_extent(root, async_extent->ram_size, + async_extent->compressed_size, + async_extent->compressed_size, + 0, *alloc_hint, &ins, 1, 1); + if (ret) { + /* + * Here we used to try again by going back to non-compressed + * path for ENOSPC. But we can't reserve space even for + * compressed size, how could it work for uncompressed size + * which requires larger size? So here we directly go error + * path. + */ + goto out_free; + } + + /* Here we're doing allocation and writeback of the compressed pages */ + em = create_io_em(inode, start, + async_extent->ram_size, /* len */ + start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + async_extent->ram_size, /* ram_bytes */ + async_extent->compress_type, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserve; + } + free_extent_map(em); + + ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + async_extent->ram_size, /* ram_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* disk_num_bytes */ + 0, /* offset */ + 1 << BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); + if (IS_ERR(ordered)) { + btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); + goto out_free_reserve; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + /* Clear dirty, set writeback and unlock the pages. */ + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); + btrfs_submit_compressed_write(ordered, + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, + async_chunk->write_flags, true); + *alloc_hint = ins.objectid + ins.offset; +done: + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + kfree(async_extent); + return; + +out_free_reserve: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_free: + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + free_async_extent_pages(async_extent); + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + btrfs_debug(fs_info, +"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", + root->root_key.objectid, btrfs_ino(inode), start, + async_extent->ram_size, ret); + kfree(async_extent); +} + +static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. 
If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * When this function fails, it unlocks all pages except @locked_page. + * + * When this function successfully creates an inline extent, it returns 1 and + * unlocks all pages including locked_page and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_page is + * the only page handled anyway). + * + * When this function succeed and creates a normal extent, the page locking + * status depends on the passed in flags: + * + * - If @keep_locked is set, all pages are kept locked. + * - Else all pages except for @locked_page are unlocked. + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for + * example. + */ +static noinline int cow_file_range(struct btrfs_inode *inode, + struct page *locked_page, u64 start, u64 end, + u64 *done_offset, + bool keep_locked, bool no_inline) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 alloc_hint = 0; + u64 orig_start = start; + u64 num_bytes; + unsigned long ram_size; + u64 cur_alloc_size = 0; + u64 min_alloc_size; + u64 blocksize = fs_info->sectorsize; + struct btrfs_key ins; + struct extent_map *em; + unsigned clear_bits; + unsigned long page_ops; + bool extent_reserved = false; + int ret = 0; + + if (btrfs_is_free_space_inode(inode)) { + ret = -EINVAL; + goto out_unlock; + } + + num_bytes = ALIGN(end - start + 1, blocksize); + num_bytes = max(blocksize, num_bytes); + ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); + + inode_should_defrag(inode, start, end, num_bytes, SZ_64K); + + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. + */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { + u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), + end + 1); + + /* lets try to make an inline extent */ + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, false); + if (ret == 0) { + /* + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be run _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. 
+ */ + extent_clear_unlock_delalloc(inode, start, end, + locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + /* + * locked_page is locked by the caller of + * writepage_delalloc(), not locked by + * __process_pages_contig(). + * + * We can't let __process_pages_contig() to unlock it, + * as it doesn't have any subpage::writers recorded. + * + * Here we manually unlock the page, since the caller + * can't determine if it's an inline extent or a + * compressed extent. + */ + unlock_page(locked_page); + ret = 1; + goto done; + } else if (ret < 0) { + goto out_unlock; + } + } + + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + + /* + * Relocation relies on the relocated extents to have exactly the same + * size as the original extents. Normally writeback for relocation data + * extents follows a NOCOW path because relocation preallocates the + * extents. However, due to an operation such as scrub turning a block + * group to RO mode, it may fallback to COW mode, so we must make sure + * an extent allocated during COW has exactly the requested size and can + * not be split into smaller extents, otherwise relocation breaks and + * fails during the stage where it updates the bytenr of file extent + * items. + */ + if (btrfs_is_data_reloc_root(root)) + min_alloc_size = num_bytes; + else + min_alloc_size = fs_info->sectorsize; + + while (num_bytes > 0) { + struct btrfs_ordered_extent *ordered; + + cur_alloc_size = num_bytes; + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + min_alloc_size, 0, alloc_hint, + &ins, 1, 1); + if (ret == -EAGAIN) { + /* + * btrfs_reserve_extent only returns -EAGAIN for zoned + * file systems, which is an indication that there are + * no active zones to allocate from at the moment. + * + * If this is the first loop iteration, wait for at + * least one zone to finish before retrying the + * allocation. Otherwise ask the caller to write out + * the already allocated blocks before coming back to + * us, or return -ENOSPC if it can't handle retries. + */ + ASSERT(btrfs_is_zoned(fs_info)); + if (start == orig_start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + if (done_offset) { + *done_offset = start - 1; + return 0; + } + ret = -ENOSPC; + } + if (ret < 0) + goto out_unlock; + cur_alloc_size = ins.offset; + extent_reserved = true; + + ram_size = ins.offset; + em = create_io_em(inode, start, ins.offset, /* len */ + start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + ram_size, /* ram_bytes */ + BTRFS_COMPRESS_NONE, /* compress_type */ + BTRFS_ORDERED_REGULAR /* type */); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_reserve; + } + free_extent_map(em); + + ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, + ram_size, ins.objectid, cur_alloc_size, + 0, 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + ret = PTR_ERR(ordered); + goto out_drop_extent_cache; + } + + if (btrfs_is_data_reloc_root(root)) { + ret = btrfs_reloc_clone_csums(ordered); + + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() + * at out_unlock label to free meta of this ordered + * extent, as its meta should be freed by + * btrfs_finish_ordered_io(). 
+ * + * So we must continue until @start is increased to + * skip current ordered extent. + */ + if (ret) + btrfs_drop_extent_map_range(inode, start, + start + ram_size - 1, + false); + } + btrfs_put_ordered_extent(ordered); + + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits + * + * Do set the Ordered (Private2) bit so we know this page was + * properly setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + + extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, + locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC, + page_ops); + if (num_bytes < cur_alloc_size) + num_bytes = 0; + else + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + extent_reserved = false; + + /* + * btrfs_reloc_clone_csums() error, since start is increased + * extent_clear_unlock_delalloc() at out_unlock label won't + * free metadata of current ordered extent, we're OK to exit. + */ + if (ret) + goto out_unlock; + } +done: + if (done_offset) + *done_offset = end; + return ret; + +out_drop_extent_cache: + btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); +out_reserve: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_unlock: + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. + */ + + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + /* + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g, + * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of @keep_locked, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (keep_locked && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + } + + /* + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, + start + cur_alloc_size - 1, + locked_page, + clear_bits, + page_ops); + start += cur_alloc_size; + } + + /* + * For the range (3). We never touched the region. 
In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). + */ + if (start < end) { + clear_bits |= EXTENT_CLEAR_DATA_RESV; + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); + } + return ret; +} + +/* + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. + */ +static noinline void submit_compressed_extents(struct btrfs_work *work) +{ + struct async_chunk *async_chunk = container_of(work, struct async_chunk, + work); + struct btrfs_fs_info *fs_info = btrfs_work_owner(work); + struct async_extent *async_extent; + unsigned long nr_pages; + u64 alloc_hint = 0; + + nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> + PAGE_SHIFT; + + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); + } + + /* atomic_sub_return implies a barrier */ + if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_chunk *async_chunk; + struct async_cow *async_cow; + + async_chunk = container_of(work, struct async_chunk, work); + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); +} + +static bool run_delalloc_compressed(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); + struct async_cow *ctx; + struct async_chunk *async_chunk; + unsigned long nr_pages; + u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); + int i; + unsigned nofs_flag; + const blk_opf_t write_flags = wbc_to_write_flags(wbc); + + nofs_flag = memalloc_nofs_save(); + ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + if (!ctx) + return false; + + unlock_extent(&inode->io_tree, start, end, NULL); + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); + + async_chunk = ctx->chunks; + atomic_set(&ctx->num_chunks, num_chunks); + + for (i = 0; i < num_chunks; i++) { + u64 cur_end = min(end, start + SZ_512K - 1); + + /* + * igrab is called higher up in the call chain, take only the + * lightweight reference for the callback lifetime + */ + ihold(&inode->vfs_inode); + async_chunk[i].async_cow = ctx; + async_chunk[i].inode = inode; + async_chunk[i].start = start; + async_chunk[i].end = cur_end; + async_chunk[i].write_flags = write_flags; + INIT_LIST_HEAD(&async_chunk[i].extents); + + /* + * The locked_page comes all the way from writepage and its + * the original page we were actually given. As we spread + * this large delalloc region across multiple async_chunk + * structs, only the first struct needs a pointer to locked_page + * + * This way we don't need racey decisions about who is supposed + * to unlock it. 
+ */ + if (locked_page) { + /* + * Depending on the compressibility, the pages might or + * might not go through async. We want all of them to + * be accounted against wbc once. Let's do it here + * before the paths diverge. wbc accounting is used + * only for foreign writeback detection and doesn't + * need full accuracy. Just account the whole thing + * against the first page. + */ + wbc_account_cgroup_owner(wbc, locked_page, + cur_end - start); + async_chunk[i].locked_page = locked_page; + locked_page = NULL; + } else { + async_chunk[i].locked_page = NULL; + } + + if (blkcg_css != blkcg_root_css) { + css_get(blkcg_css); + async_chunk[i].blkcg_css = blkcg_css; + async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; + } else { + async_chunk[i].blkcg_css = NULL; + } + + btrfs_init_work(&async_chunk[i].work, compress_file_range, + submit_compressed_extents, async_cow_free); + + nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); + atomic_add(nr_pages, &fs_info->async_delalloc_pages); + + btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); + + start = cur_end + 1; + } + return true; +} + +/* + * Run the delalloc range from start to end, and write back any dirty pages + * covered by the range. + */ +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty) +{ + u64 done_offset = end; + int ret; + + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, &done_offset, + true, false); + if (ret) + return ret; + extent_write_locked_range(&inode->vfs_inode, locked_page, start, + done_offset, wbc, pages_dirty); + start = done_offset + 1; + } + + return 1; +} + +static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, bool nowait) +{ + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); + struct btrfs_ordered_sum *sums; + int ret; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, + &list, 0, nowait); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + if (ret < 0) + return ret; + return 1; +} + +static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, + const u64 start, const u64 end) +{ + const bool is_space_ino = btrfs_is_free_space_inode(inode); + const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); + const u64 range_bytes = end + 1 - start; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 range_start = start; + u64 count; + int ret; + + /* + * If EXTENT_NORESERVE is set it means that when the buffered write was + * made we had not enough available data space and therefore we did not + * reserve data space for it, since we though we could do NOCOW for the + * respective file range (either there is prealloc extent or the inode + * has the NOCOW bit set). + * + * However when we need to fallback to COW mode (because for example the + * block group for the corresponding extent was turned to RO mode by a + * scrub or relocation) we need to do the following: + * + * 1) We increment the bytes_may_use counter of the data space info. + * If COW succeeds, it allocates a new data extent and after doing + * that it decrements the space info's bytes_may_use counter and + * increments its bytes_reserved counter by the same amount (we do + * this at btrfs_add_reserved_bytes()). 
So we need to increment the + * bytes_may_use counter to compensate (when space is reserved at + * buffered write time, the bytes_may_use counter is incremented); + * + * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so + * that if the COW path fails for any reason, it decrements (through + * extent_clear_unlock_delalloc()) the bytes_may_use counter of the + * data space info, which we incremented in the step above. + * + * If we need to fallback to cow and the inode corresponds to a free + * space cache inode or an inode of the data relocation tree, we must + * also increment bytes_may_use of the data space_info for the same + * reason. Space caches and relocated data extents always get a prealloc + * extent for them, however scrub or balance may have set the block + * group that contains that extent to RO mode and therefore force COW + * when starting writeback. + */ + count = count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); + if (count > 0 || is_space_ino || is_reloc_ino) { + u64 bytes = count; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_space_info *sinfo = fs_info->data_sinfo; + + if (is_space_ino || is_reloc_ino) + bytes = range_bytes; + + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + spin_unlock(&sinfo->lock); + + if (count > 0) + clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + NULL); + } + + /* + * Don't try to create inline extents, as a mix of inline extent that + * is written out and unlocked directly and a normal NOCOW extent + * doesn't work. + */ + ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ASSERT(ret != 1); + return ret; +} + +struct can_nocow_file_extent_args { + /* Input fields. */ + + /* Start file offset of the range we want to NOCOW. */ + u64 start; + /* End file offset (inclusive) of the range we want to NOCOW. */ + u64 end; + bool writeback_path; + bool strict; + /* + * Free the path passed to can_nocow_file_extent() once it's not needed + * anymore. + */ + bool free_path; + + /* Output fields. Only set when can_nocow_file_extent() returns 1. */ + + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + /* Number of bytes that can be written to in NOCOW mode. */ + u64 num_bytes; +}; + +/* + * Check if we can NOCOW the file extent that the path points to. + * This function may return with the path released, so the caller should check + * if path->nodes[0] is NULL or not if it needs to use the path afterwards. + * + * Returns: < 0 on error + * 0 if we can not NOCOW + * 1 if we can NOCOW + */ +static int can_nocow_file_extent(struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_inode *inode, + struct can_nocow_file_extent_args *args) +{ + const bool is_freespace_inode = btrfs_is_free_space_inode(inode); + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 extent_type; + int can_nocow = 0; + int ret = 0; + bool nowait = path->nowait; + + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + goto out; + + /* Can't access these fields unless we know it's not an inline extent. 
*/ + args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + extent_type == BTRFS_FILE_EXTENT_REG) + goto out; + + /* + * If the extent was created before the generation where the last snapshot + * for its subvolume was created, then this implies the extent is shared, + * hence we must COW. + */ + if (!args->strict && + btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + /* An explicit hole, must COW. */ + if (args->disk_bytenr == 0) + goto out; + + /* Compressed/encrypted/encoded extents must be COWed. */ + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + extent_end = btrfs_file_extent_end(path); + + /* + * The following checks can be expensive, as they need to take other + * locks and do btree or rbtree searches, so release the path to avoid + * blocking other tasks for too long. + */ + btrfs_release_path(path); + + ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), + key->offset - args->extent_offset, + args->disk_bytenr, args->strict, path); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + if (args->free_path) { + /* + * We don't need the path anymore, plus through the + * csum_exist_in_range() call below we will end up allocating + * another path. So free the path to avoid unnecessary extra + * memory usage. + */ + btrfs_free_path(path); + path = NULL; + } + + /* If there are pending snapshots for this root, we must COW. */ + if (args->writeback_path && !is_freespace_inode && + atomic_read(&root->snapshot_force_cow)) + goto out; + + args->disk_bytenr += args->extent_offset; + args->disk_bytenr += args->start - key->offset; + args->num_bytes = min(args->end + 1, extent_end) - args->start; + + /* + * Force COW if csums exist in the range. This ensures that csums for a + * given extent are either valid or do not exist. + */ + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, + nowait); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + can_nocow = 1; + out: + if (args->free_path && path) + btrfs_free_path(path); + + return ret < 0 ? ret : can_nocow; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static noinline int run_delalloc_nocow(struct btrfs_inode *inode, + struct page *locked_page, + const u64 start, const u64 end) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + u64 cow_start = (u64)-1; + u64 cur_offset = start; + int ret; + bool check_prev = true; + u64 ino = btrfs_ino(inode); + struct can_nocow_file_extent_args nocow_args = { 0 }; + + /* + * Normally on a zoned device we're only doing COW writes, but in case + * of relocation on a zoned filesystem serializes I/O so that we're only + * writing sequentially and can end up here as well. 
+ */ + ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + + nocow_args.end = end; + nocow_args.writeback_path = true; + + while (1) { + struct btrfs_block_group *nocow_bg = NULL; + struct btrfs_ordered_extent *ordered; + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u64 extent_end; + u64 ram_bytes; + u64 nocow_end; + int extent_type; + bool is_prealloc; + + ret = btrfs_lookup_file_extent(NULL, root, path, ino, + cur_offset, 0); + if (ret < 0) + goto error; + + /* + * If there is no extent for our range when doing the initial + * search, then go back to the previous slot as it will be the + * one containing the search offset + */ + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = false; +next_slot: + /* Go to next leaf if we have exhausted the current one */ + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto error; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* Didn't find anything for our INO */ + if (found_key.objectid > ino) + break; + /* + * Keep searching until we find an EXTENT_ITEM or there are no + * more extents for this inode + */ + if (WARN_ON_ONCE(found_key.objectid < ino) || + found_key.type < BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + goto next_slot; + } + + /* Found key is not EXTENT_DATA_KEY or starts after req range */ + if (found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + /* + * If the found extent starts after requested offset, then + * adjust extent_end to be right before this extent begins + */ + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + extent_type = 0; + goto must_cow; + } + + /* + * Found extent which begins before our range and potentially + * intersect it + */ + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + /* If this is triggered then we have a memory corruption. */ + ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); + if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { + ret = -EUCLEAN; + goto error; + } + ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + extent_end = btrfs_file_extent_end(path); + + /* + * If the extent we got ends before our current offset, skip to + * the next extent. + */ + if (extent_end <= cur_offset) { + path->slots[0]++; + goto next_slot; + } + + nocow_args.start = cur_offset; + ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); + if (ret < 0) + goto error; + if (ret == 0) + goto must_cow; + + ret = 0; + nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (!nocow_bg) { +must_cow: + /* + * If we can't perform NOCOW writeback for the range, + * then record the beginning of the range that needs to + * be COWed. It will be written out before the next + * NOCOW range if we find one, or when exiting this + * loop. 
+ */ + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + if (!path->nodes[0]) + continue; + path->slots[0]++; + goto next_slot; + } + + /* + * COW range from cow_start to found_key.offset - 1. As the key + * will contain the beginning of the first extent that can be + * NOCOW, following one which needs to be COW'ed + */ + if (cow_start != (u64)-1) { + ret = fallback_to_cow(inode, locked_page, + cow_start, found_key.offset - 1); + cow_start = (u64)-1; + if (ret) { + btrfs_dec_nocow_writers(nocow_bg); + goto error; + } + } + + nocow_end = cur_offset + nocow_args.num_bytes - 1; + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; + if (is_prealloc) { + u64 orig_start = found_key.offset - nocow_args.extent_offset; + struct extent_map *em; + + em = create_io_em(inode, cur_offset, nocow_args.num_bytes, + orig_start, + nocow_args.disk_bytenr, /* block_start */ + nocow_args.num_bytes, /* block_len */ + nocow_args.disk_num_bytes, /* orig_block_len */ + ram_bytes, BTRFS_COMPRESS_NONE, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + btrfs_dec_nocow_writers(nocow_bg); + ret = PTR_ERR(em); + goto error; + } + free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, cur_offset, + nocow_args.num_bytes, nocow_args.num_bytes, + nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + is_prealloc + ? (1 << BTRFS_ORDERED_PREALLOC) + : (1 << BTRFS_ORDERED_NOCOW), + BTRFS_COMPRESS_NONE); + btrfs_dec_nocow_writers(nocow_bg); + if (IS_ERR(ordered)) { + if (is_prealloc) { + btrfs_drop_extent_map_range(inode, cur_offset, + nocow_end, false); + } + ret = PTR_ERR(ordered); + goto error; + } + + if (btrfs_is_data_reloc_root(root)) + /* + * Error handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler + * from freeing metadata of created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + + cur_offset = extent_end; + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error + * handler, as metadata for created ordered extent will only + * be freed by btrfs_finish_ordered_io(). + */ + if (ret) + goto error; + if (cur_offset > end) + break; + } + btrfs_release_path(path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + + if (cow_start != (u64)-1) { + cur_offset = end; + ret = fallback_to_cow(inode, locked_page, cow_start, end); + cow_start = (u64)-1; + if (ret) + goto error; + } + + btrfs_free_path(path); + return 0; + +error: + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to cow_start to ensure the COW region is unlocked + * as well. 
+ */ + if (cow_start != (u64)-1) + cur_offset = cow_start; + if (cur_offset < end) + extent_clear_unlock_delalloc(inode, cur_offset, end, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | + PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_free_path(path); + return ret; +} + +static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) +{ + if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, + 0, NULL)) + return false; + return true; + } + return false; +} + +/* + * Function to process delayed allocation (create CoW) for ranges which are + * being touched for the first time. + */ +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc) +{ + const bool zoned = btrfs_is_zoned(inode->root->fs_info); + int ret; + + /* + * The range must cover part of the @locked_page, or a return of 1 + * can confuse the caller. + */ + ASSERT(!(end <= page_offset(locked_page) || + start >= page_offset(locked_page) + PAGE_SIZE)); + + if (should_nocow(inode, start, end)) { + ret = run_delalloc_nocow(inode, locked_page, start, end); + goto out; + } + + if (btrfs_inode_can_compress(inode) && + inode_need_compress(inode, start, end) && + run_delalloc_compressed(inode, locked_page, start, end, wbc)) + return 1; + + if (zoned) + ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + true); + else + ret = cow_file_range(inode, locked_page, start, end, NULL, + false, false); + +out: + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, locked_page, start, + end - start + 1); + return ret; +} + +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *orig, u64 split) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 size; + + /* not delalloc, ignore it */ + if (!(orig->state & EXTENT_DELALLOC)) + return; + + size = orig->end - orig->start + 1; + if (size > fs_info->max_extent_size) { + u32 num_extents; + u64 new_size; + + /* + * See the explanation in btrfs_merge_delalloc_extent, the same + * applies here, just in reverse. + */ + new_size = orig->end - split + 1; + num_extents = count_max_extents(fs_info, new_size); + new_size = split - orig->start; + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) + return; + } + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); +} + +/* + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need. 
+ */ +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, + struct extent_state *other) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 new_size, old_size; + u32 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return; + + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= fs_info->max_extent_size) { + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); + return; + } + + /* + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. + */ + old_size = other->end - other->start + 1; + num_extents = count_max_extents(fs_info, old_size); + old_size = new->end - new->start + 1; + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) + return; + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); +} + +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + spin_lock(&root->delalloc_lock); + if (list_empty(&inode->delalloc_inodes)) { + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!list_empty(&inode->delalloc_inodes)) { + list_del_init(&inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &inode->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + ASSERT(list_empty(&root->delalloc_inodes)); + spin_lock(&fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&fs_info->delalloc_root_lock); + } + } +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) +{ + spin_lock(&root->delalloc_lock); + __btrfs_del_delalloc_inode(root, inode); + spin_unlock(&root->delalloc_lock); +} + +/* + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done. 
+ */ +void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, + u32 bits) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) + WARN_ON(1); + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = inode->root; + u64 len = state->end + 1 - state->start; + u32 num_extents = count_max_extents(fs_info, len); + bool do_list = !btrfs_is_free_space_inode(inode); + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, num_extents); + spin_unlock(&inode->lock); + + /* For sanity tests */ + if (btrfs_is_testing(fs_info)) + return; + + percpu_counter_add_batch(&fs_info->delalloc_bytes, len, + fs_info->delalloc_batch); + spin_lock(&inode->lock); + inode->delalloc_bytes += len; + if (bits & EXTENT_DEFRAG) + inode->defrag_bytes += len; + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &inode->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); + spin_unlock(&inode->lock); + } + + if (!(state->state & EXTENT_DELALLOC_NEW) && + (bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + inode->new_delalloc_bytes += state->end + 1 - state->start; + spin_unlock(&inode->lock); + } +} + +/* + * Once a range is no longer delalloc this function ensures that proper + * accounting happens. + */ +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *state, u32 bits) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 len = state->end + 1 - state->start; + u32 num_extents = count_max_extents(fs_info, len); + + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { + spin_lock(&inode->lock); + inode->defrag_bytes -= len; + spin_unlock(&inode->lock); + } + + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = inode->root; + bool do_list = !btrfs_is_free_space_inode(inode); + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -num_extents); + spin_unlock(&inode->lock); + + /* + * We don't reserve metadata space for space cache inodes so we + * don't need to call delalloc_release_metadata if there is an + * error. + */ + if (bits & EXTENT_CLEAR_META_RESV && + root != fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, len, false); + + /* For sanity tests. 
*/ + if (btrfs_is_testing(fs_info)) + return; + + if (!btrfs_is_data_reloc_root(root) && + do_list && !(state->state & EXTENT_NORESERVE) && + (bits & EXTENT_CLEAR_DATA_RESV)) + btrfs_free_reserved_data_space_noquota(fs_info, len); + + percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, + fs_info->delalloc_batch); + spin_lock(&inode->lock); + inode->delalloc_bytes -= len; + if (do_list && inode->delalloc_bytes == 0 && + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &inode->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); + spin_unlock(&inode->lock); + } + + if ((state->state & EXTENT_DELALLOC_NEW) && + (bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + ASSERT(inode->new_delalloc_bytes >= len); + inode->new_delalloc_bytes -= len; + if (bits & EXTENT_ADD_INODE_BYTES) + inode_add_bytes(&inode->vfs_inode, len); + spin_unlock(&inode->lock); + } +} + +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) +{ + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_ordered_extent *new; + int ret; + + /* Must always be called for the beginning of an ordered extent. */ + if (WARN_ON_ONCE(start != ordered->disk_bytenr)) + return -EINVAL; + + /* No need to split if the ordered extent covers the entire bio. */ + if (ordered->disk_num_bytes == len) { + refcount_inc(&ordered->refs); + bbio->ordered = ordered; + return 0; + } + + /* + * Don't split the extent_map for NOCOW extents, as we're writing into + * a pre-existing one. + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); + if (ret) + return ret; + } + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return PTR_ERR(new); + bbio->ordered = new; + return 0; +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. 
+ */ +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) +{ + struct btrfs_ordered_sum *sum; + struct btrfs_root *csum_root = NULL; + int ret; + + list_for_each_entry(sum, list, list) { + trans->adding_csums = true; + if (!csum_root) + csum_root = btrfs_csum_root(trans->fs_info, + sum->logical); + ret = btrfs_csum_file_blocks(trans, csum_root, sum); + trans->adding_csums = false; + if (ret) + return ret; + } + return 0; +} + +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state) +{ + WARN_ON(PAGE_ALIGNED(end)); + + if (start >= i_size_read(&inode->vfs_inode) && + !(inode->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case so just + * set the delalloc new bit for the range directly. + */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + int ret; + + ret = btrfs_find_new_delalloc_bytes(inode, start, + end + 1 - start, + cached_state); + if (ret) + return ret; + } + + return set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_inode *inode; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup = + container_of(work, struct btrfs_writepage_fixup, work); + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + struct page *page = fixup->page; + struct btrfs_inode *inode = fixup->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 page_start = page_offset(page); + u64 page_end = page_offset(page) + PAGE_SIZE - 1; + int ret = 0; + bool free_delalloc_space = true; + + /* + * This is similar to page_mkwrite, we need to reserve the space before + * we take the page lock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); +again: + lock_page(page); + + /* + * Before we queued this fixup, we took a reference on the page. + * page->mapping may go NULL, but it shouldn't be moved to a different + * address space. + */ + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + /* + * Unfortunately this is a little tricky, either + * + * 1) We got here and our page had already been dealt with and + * we reserved our space, thus ret == 0, so we need to just + * drop our space reservation and bail. 
This can happen the + * first time we come into the fixup worker, or could happen + * while waiting for the ordered extent. + * 2) Our page was already dealt with, but we happened to get an + * ENOSPC above from the btrfs_delalloc_reserve_space. In + * this case we obviously don't have anything to release, but + * because the page was already dealt with we don't want to + * mark the page with an error, so make sure we're resetting + * ret to 0. This is why we have this check _before_ the ret + * check, because we do not want to have a surprise ENOSPC + * when the page was already properly dealt with. + */ + if (!ret) { + btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE, + true); + } + ret = 0; + goto out_page; + } + + /* + * We can't mess with the page state unless it is locked, so now that + * it is locked bail if we failed to make our space reservation. + */ + if (ret) + goto out_page; + + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + + /* already ordered? We're done */ + if (PageOrdered(page)) + goto out_reserved; + + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + if (ordered) { + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + unlock_page(page); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + &cached_state); + if (ret) + goto out_reserved; + + /* + * Everything went as planned, we're now the owner of a dirty page with + * delayed allocation bits set and space reserved for our COW + * destination. + * + * The page was dirty when we started, nothing should have cleaned it. + */ + BUG_ON(!PageDirty(page)); + free_delalloc_space = false; +out_reserved: + btrfs_delalloc_release_extents(inode, PAGE_SIZE); + if (free_delalloc_space) + btrfs_delalloc_release_space(inode, data_reserved, page_start, + PAGE_SIZE, true); + unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); +out_page: + if (ret) { + /* + * We hit ENOSPC or other errors. Update the mapping and page + * to reflect the errors and clean the page. + */ + mapping_set_error(page->mapping, ret); + btrfs_mark_ordered_io_finished(inode, page, page_start, + PAGE_SIZE, !ret); + clear_page_dirty_for_io(page); + } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); + unlock_page(page); + put_page(page); + kfree(fixup); + extent_changeset_free(data_reserved); + /* + * As a precaution, do a delayed iput in case it would be the last iput + * that could need flushing space. Recursing back to fixup worker would + * deadlock. + */ + btrfs_add_delayed_iput(inode); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. 
+ */ +int btrfs_writepage_cow_fixup(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_writepage_fixup *fixup; + + /* This page has ordered extent covering it already */ + if (PageOrdered(page)) + return 0; + + /* + * PageChecked is set below when we create a fixup worker for this page, + * don't try to create another one if we're already PageChecked() + * + * The extent_io writepage code will redirty the page if we send back + * EAGAIN. + */ + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + /* + * We are already holding a reference to this inode from + * write_cache_pages. We need to hold it because the space reservation + * takes place outside of the page lock, and we can't trust + * page->mapping outside of the page lock. + */ + ihold(inode); + btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); + get_page(page); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + fixup->page = page; + fixup->inode = BTRFS_I(inode); + btrfs_queue_work(fs_info->fixup_workers, &fixup->work); + + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 file_pos, + struct btrfs_file_extent_item *stack_fi, + const bool update_inode_bytes, + u64 qgroup_reserved) +{ + struct btrfs_root *root = inode->root; + const u64 sectorsize = root->fs_info->sectorsize; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); + u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 offset = btrfs_stack_file_extent_offset(stack_fi); + u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); + u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); + struct btrfs_drop_extents_args drop_args = { 0 }; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ + drop_args.path = path; + drop_args.start = file_pos; + drop_args.end = file_pos + num_bytes; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*stack_fi); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) + goto out; + + if (!drop_args.extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*stack_fi)); + if (ret) + goto out; + } + leaf = path->nodes[0]; + btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); + write_extent_buffer(leaf, stack_fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_file_extent_item)); + + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + + /* + * If we dropped an inline extent here, we know the range where it is + * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the + * number of bytes only for that range containing the inline extent. 
+ * The remaining of the range will be processed when clearning the + * EXTENT_DELALLOC_BIT bit through the ordered extent completion. + */ + if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { + u64 inline_size = round_down(drop_args.bytes_found, sectorsize); + + inline_size = drop_args.bytes_found - inline_size; + btrfs_update_inode_bytes(inode, sectorsize, inline_size); + drop_args.bytes_found -= inline_size; + num_bytes -= sectorsize; + } + + if (update_inode_bytes) + btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); + if (ret) + goto out; + + ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), + file_pos - offset, + qgroup_reserved, &ins); +out: + btrfs_free_path(path); + + return ret; +} + +static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, + u64 start, u64 len) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + ASSERT(cache); + + spin_lock(&cache->lock); + cache->delalloc_bytes -= len; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); +} + +static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *oe) +{ + struct btrfs_file_extent_item stack_fi; + bool update_inode_bytes; + u64 num_bytes = oe->num_bytes; + u64 ram_bytes = oe->ram_bytes; + + memset(&stack_fi, 0, sizeof(stack_fi)); + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, + oe->disk_num_bytes); + btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } + btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); + btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); + /* Encryption and other encoding is reserved and all 0 */ + + /* + * For delalloc, when completing an ordered extent we update the inode's + * bytes when clearing the range in the inode's io tree, so pass false + * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), + * except if the ordered extent was truncated. + */ + update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || + test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); + + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + update_inode_bytes, oe->qgroup_rsv); +} + +/* + * As ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. 
+ */ +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans = NULL; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_state *cached_state = NULL; + u64 start, end; + int compress_type = 0; + int ret = 0; + u64 logical_len = ordered_extent->num_bytes; + bool freespace_inode; + bool truncated = false; + bool clear_reserved_extent = true; + unsigned int clear_bits = EXTENT_DEFRAG; + + start = ordered_extent->file_offset; + end = start + ordered_extent->num_bytes - 1; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) + clear_bits |= EXTENT_DELALLOC_NEW; + + freespace_inode = btrfs_is_free_space_inode(inode); + if (!freespace_inode) + btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + ret = -EIO; + goto out; + } + + if (btrfs_is_zoned(fs_info)) + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; + /* Truncated the entire extent, don't bother adding */ + if (!logical_len) + goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + + btrfs_inode_safe_disk_i_size_write(inode, 0); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + trans->block_rsv = &inode->block_rsv; + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + goto out; + } + + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); + + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + trans->block_rsv = &inode->block_rsv; + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compress_type = ordered_extent->compress_type; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compress_type); + ret = btrfs_mark_extent_written(trans, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } else { + BUG_ON(root == fs_info->tree_root); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); + if (!ret) { + clear_reserved_extent = false; + btrfs_release_delalloc_bytes(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } + } + unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = add_pending_csums(trans, &ordered_extent->list); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + /* + * If this is a new delalloc range, 
clear its new delalloc flag to + * update the inode's number of bytes. This needs to be done first + * before updating the inode item. + */ + if ((clear_bits & EXTENT_DELALLOC_NEW) && + !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) + clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + goto out; + } + ret = 0; +out: + clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); + + if (trans) + btrfs_end_transaction(trans); + + if (ret || truncated) { + u64 unwritten_start = start; + + /* + * If we failed to finish this ordered extent for any reason we + * need to make sure BTRFS_ORDERED_IOERR is set on the ordered + * extent, and mark the inode with the error if it wasn't + * already set. Any error during writeback would have already + * set the mapping error, so we need to set it if we're the ones + * marking this ordered extent as failed. + */ + if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, + &ordered_extent->flags)) + mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + + if (truncated) + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); + + /* Drop extent maps for the part of the extent we didn't write. */ + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. We only free the extent in the + * truncated case if we didn't write out the extent at all. + * + * If we made it past insert_reserved_file_extent before we + * errored out then we don't need to do this as the accounting + * has already been done. + */ + if ((ret || !logical_len) && + clear_reserved_extent && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + /* + * Discard the range before returning it back to the + * free space pool + */ + if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) + btrfs_discard_extent(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, + NULL); + btrfs_free_reserved_extent(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, 1); + /* + * Actually free the qgroup rsv which was released when + * the ordered extent was created. + */ + btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + ordered_extent->qgroup_rsv, + BTRFS_QGROUP_RSV_DATA); + } + } + + /* + * This needs to be done to make sure anybody waiting knows we are done + * updating everything for this ordered extent. + */ + btrfs_remove_ordered_extent(inode, ordered_extent); + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + return ret; +} + +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) +{ + if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + btrfs_finish_ordered_zoned(ordered); + return btrfs_finish_one_ordered(ordered); +} + +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. 
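+ *
+ * For example, with the default crc32c checksum and a 4K sector size this
+ * simply hashes the 4K of data at @pgoff in @page and memcmp()s the 4 byte
+ * result against @csum_expected, returning -EIO on a mismatch.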
+ */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; +} + +/* + * Verify the checksum of a single data sector. + * + * @bbio: btrfs_io_bio which contains the csum + * @dev: device the sector is on + * @bio_offset: offset to the beginning of the bio (in bytes) + * @bv: bio_vec to check + * + * Check if the checksum on a data block is valid. When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero. + * + * Return %true if the sector is ok or had no checksum to start with, else %false. + */ +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 file_offset = bbio->file_offset + bio_offset; + u64 end = file_offset + bv->bv_len - 1; + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; + + ASSERT(bv->bv_len == fs_info->sectorsize); + + if (!bbio->csum) + return true; + + if (btrfs_is_data_reloc_root(inode->root) && + test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); + return true; + } + + csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * + fs_info->csum_size; + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) + goto zeroit; + return true; + +zeroit: + btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, + bbio->mirror_num); + if (dev) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + memzero_bvec(bv); + return false; +} + +/* + * btrfs_add_delayed_iput - perform a delayed iput on @inode + * + * @inode: The inode we want to perform iput on + * + * This function uses the generic vfs_inode::i_count to track whether we should + * just decrement it (in case it's > 1) or if this is the last iput then link + * the inode to the delayed iput machinery. Delayed iputs are processed at + * transaction commit time/superblock commit/cleaner kthread. + */ +void btrfs_add_delayed_iput(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned long flags; + + if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) + return; + + atomic_inc(&fs_info->nr_delayed_iputs); + /* + * Need to be irq safe here because we can be called from either an irq + * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq + * context. 
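+ *
+ * Either way the effect is the same: a reference count of, say, 3 is
+ * simply dropped to 2 by the atomic_add_unless() above, and only the
+ * final reference gets queued on fs_info->delayed_iputs for the
+ * cleaner to process later.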
+ */ + spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); + ASSERT(list_empty(&inode->delayed_iput)); + list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); + spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); +} + +static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + list_del_init(&inode->delayed_iput); + spin_unlock_irq(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); + spin_lock_irq(&fs_info->delayed_iput_lock); +} + +static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + if (!list_empty(&inode->delayed_iput)) { + spin_lock_irq(&fs_info->delayed_iput_lock); + if (!list_empty(&inode->delayed_iput)) + run_delayed_iput_locked(fs_info, inode); + spin_unlock_irq(&fs_info->delayed_iput_lock); + } +} + +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) +{ + /* + * btrfs_put_ordered_extent() can run in irq context (see bio.c), which + * calls btrfs_add_delayed_iput() and that needs to lock + * fs_info->delayed_iput_lock. So we need to disable irqs here to + * prevent a deadlock. + */ + spin_lock_irq(&fs_info->delayed_iput_lock); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; + + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + run_delayed_iput_locked(fs_info, inode); + if (need_resched()) { + spin_unlock_irq(&fs_info->delayed_iput_lock); + cond_resched(); + spin_lock_irq(&fs_info->delayed_iput_lock); + } + } + spin_unlock_irq(&fs_info->delayed_iput_lock); +} + +/* + * Wait for flushing all delayed iputs + * + * @fs_info: the filesystem + * + * This will wait on any delayed iputs that are currently running with KILLABLE + * set. Once they are all done running we will return, unless we are killed in + * which case we return EINTR. This helps in user operations like fallocate etc + * that might get blocked on the iputs. + * + * Return EINTR if we were killed, 0 if nothing's pending + */ +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) +{ + int ret = wait_event_killable(fs_info->delayed_iputs_wait, + atomic_read(&fs_info->nr_delayed_iputs) == 0); + if (ret) + return -EINTR; + return 0; +} + +/* + * This creates an orphan entry for the given inode in case something goes wrong + * in the middle of an unlink. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + int ret; + + ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + +/* + * We have done the delete so we can go ahead and remove the orphan item for + * this particular inode. + */ +static int btrfs_orphan_del(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. 
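+ *
+ * Orphan items all live under the key (BTRFS_ORPHAN_OBJECTID,
+ * BTRFS_ORPHAN_ITEM_KEY, <inode number>), so the loop below keeps searching
+ * from the end of that key range and either deletes a stale item or lets the
+ * final iput() finish evicting the unlinked inode.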
+ */ +int btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + u64 last_objectid = 0; + int ret = 0, nr_unlink = 0; + + if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + path->reada = READA_BACK; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didn't + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + + if (found_key.offset == last_objectid) { + /* + * We found the same inode as before. This means we were + * not able to remove its items via eviction triggered + * by an iput(). A transaction abort may have happened, + * due to -ENOSPC for example, so try to grab the error + * that lead to a transaction abort, if any. + */ + btrfs_err(fs_info, + "Error removing orphan entry, stopping orphan cleanup"); + ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(fs_info->sb, last_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + if (ret != -ENOENT) + goto out; + } + + if (!inode && root == fs_info->tree_root) { + struct btrfs_root *dead_root; + int is_dead_root = 0; + + /* + * This is an orphan in the tree root. Currently these + * could come from 2 sources: + * a) a root (snapshot/subvolume) deletion in progress + * b) a free space cache inode + * We need to distinguish those two, as the orphan item + * for a root must not get deleted before the deletion + * of the snapshot/subvolume's tree completes. + * + * btrfs_find_orphan_roots() ran before us, which has + * found all deleted roots and loaded them into + * fs_info->fs_roots_radix. So here we can find if an + * orphan item corresponds to a deleted root by looking + * up the root from that radix tree. + */ + + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); + if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) + is_dead_root = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (is_dead_root) { + /* prevent this orphan from being found again */ + key.offset = found_key.objectid - 1; + continue; + } + + } + + /* + * If we have an inode with links, there are a couple of + * possibilities: + * + * 1. 
We were halfway through creating fsverity metadata for the + * file. In that case, the orphan item represents incomplete + * fsverity metadata which must be cleaned up with + * btrfs_drop_verity_items and deleting the orphan item. + + * 2. Old kernels (before v3.12) used to create an + * orphan item for truncate indicating that there were possibly + * extent items past i_size that needed to be deleted. In v3.12, + * truncate was changed to update i_size in sync with the extent + * items, but the (useless) orphan item was still created. Since + * v4.18, we don't create the orphan item for truncate at all. + * + * So, this item could mean that we need to do a truncate, but + * only if this filesystem was last used on a pre-v3.12 kernel + * and was not cleanly unmounted. The odds of that are quite + * slim, and it's a pain to do the truncate now, so just delete + * the orphan item. + * + * It's also possible that this orphan item was supposed to be + * deleted but wasn't. The inode number may have been reused, + * but either way, we can delete the orphan item. + */ + if (!inode || inode->i_nlink) { + if (inode) { + ret = btrfs_drop_verity_items(BTRFS_I(inode)); + iput(inode); + inode = NULL; + if (ret) + goto out; + } + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_debug(fs_info, "auto deleting %Lu", + found_key.objectid); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + btrfs_end_transaction(trans); + if (ret) + goto out; + continue; + } + + nr_unlink++; + + /* this will do delete_inode and everything for us */ + iput(inode); + } + /* release the path since we're done with it */ + btrfs_release_path(path); + + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { + trans = btrfs_join_transaction(root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans); + } + + if (nr_unlink) + btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); + +out: + if (ret) + btrfs_err(fs_info, "could not do orphan cleanup %d", ret); + btrfs_free_path(path); + return ret; +} + +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. 
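+ * This works because leaf items are sorted by key: for a given inode the
+ * xattr items (BTRFS_XATTR_ITEM_KEY) sort right after the inode item and its
+ * backrefs, and ACLs are stored as the xattrs XATTR_NAME_POSIX_ACL_ACCESS and
+ * XATTR_NAME_POSIX_ACL_DEFAULT, so hitting a larger key type means no ACL can
+ * exist.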
+ * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; + int scanned = 0; + + if (!xattr_access) { + xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); + xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); + } + + slot++; + *first_xattr_slot = -1; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; + return 1; +} + +/* + * read an inode from the btree into the in-memory inode + */ +static int btrfs_read_locked_inode(struct inode *inode, + struct btrfs_path *in_path) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_path *path = in_path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + unsigned long ptr; + int maybe_acls; + u32 rdev; + int ret; + bool filled = false; + int first_xattr_slot; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) { + if (path != in_path) + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + + if (filled) + goto cache_index; + + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); + + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); + + inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + btrfs_timespec_nsec(leaf, 
&inode_item->ctime)); + + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_timespec_nsec(leaf, &inode_item->otime); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + + inode_set_iversion_queried(inode, + btrfs_inode_sequence(leaf, inode_item)); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + +cache_index: + /* + * If we were modified in the current generation and evicted from memory + * and then re-read we need to do a full sync since we don't have any + * idea about which extents were modified before we were evicted from + * cache. + * + * This is required for both inode re-read from disk and delayed inode + * in delayed_nodes_tree. + */ + if (BTRFS_I(inode)->last_trans == fs_info->generation) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of occasional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily. + */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + + /* + * Same logic as for last_unlink_trans. We don't persist the generation + * of the last transaction where this inode was used for a reflink + * operation, so after eviction and reloading the inode we must be + * pessimistic and assume the last transaction that modified the inode. 
+ */ + BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(BTRFS_I(inode))) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } +cache_acl: + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], + btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(fs_info, + "error loading props for ino %llu (root %llu): %d", + btrfs_ino(BTRFS_I(inode)), + root->root_key.objectid, ret); + } + if (path != in_path) + btrfs_free_path(path); + + if (!maybe_acls) + cache_no_acl(inode); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &btrfs_aops; + break; + default: + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + + btrfs_sync_inode_flags_to_i_flags(inode); + return 0; +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + struct btrfs_map_token token; + u64 flags; + + btrfs_init_map_token(&token, leaf); + + btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); + btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); + btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_token_inode_mode(&token, item, inode->i_mode); + btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + + btrfs_set_token_timespec_sec(&token, &item->atime, + inode->i_atime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->atime, + inode->i_atime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->mtime, + inode->i_mtime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->ctime, + inode_get_ctime(inode).tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode_get_ctime(inode).tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); + + btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + btrfs_set_token_inode_generation(&token, item, + BTRFS_I(inode)->generation); + btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); + 
btrfs_set_token_inode_transid(&token, item, trans->transid); + btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); + btrfs_set_token_inode_block_group(&token, item, 0); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(inode) + && !btrfs_is_data_reloc_root(root) + && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + btrfs_update_root_times(trans, root); + + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + const struct fscrypt_str *name, + struct btrfs_rename_ctx *rename_ctx) +{ + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + int ret = 0; + struct btrfs_dir_item *di; + u64 index; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(path); + + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. 
+ * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (inode->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = inode->dir_index; + goto skip_backref; + } + } + + ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); + if (ret) { + btrfs_info(fs_info, + "failed to delete reference to %.*s, inode %llu parent %llu", + name->len, name->name, ino, dir_ino); + btrfs_abort_transaction(trans, ret); + goto err; + } +skip_backref: + if (rename_ctx) + rename_ctx->index = index; + + ret = btrfs_delete_delayed_dir_index(trans, dir, index); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto err; + } + + /* + * If we are in a rename context, we don't need to update anything in the + * log. That will be done later during the rename by btrfs_log_new_name(). + * Besides that, doing it here would only cause extra unnecessary btree + * operations on the log tree, increasing latency for applications. + */ + if (!rename_ctx) { + btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, dir, index); + } + + /* + * If we have a pending delayed iput we could end up with the final iput + * being run in btrfs-cleaner context. If we have enough of these built + * up we can end up burning a lot of time in btrfs-cleaner without any + * way to throttle the unlinks. Since we're currently holding a ref on + * the inode we can run the delayed iput here without any issues as the + * final iput won't be done until after we drop the ref we're currently + * holding. + */ + btrfs_run_delayed_iput(fs_info, inode); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); + inode_inc_iversion(&inode->vfs_inode); + inode_inc_iversion(&dir->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); + ret = btrfs_update_inode(trans, root, dir); +out: + return ret; +} + +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name) +{ + int ret; + + ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); + if (!ret) { + drop_nlink(&inode->vfs_inode); + ret = btrfs_update_inode(trans, inode->root, inode); + } + return ret; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. 
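+ *
+ * That is largely down to copy-on-write: removing a name still has to write
+ * new versions of the affected directory and inode metadata before any space
+ * is actually released, so on a nearly full filesystem the normal reservation
+ * can fail even though the unlink would eventually free space.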
+ */ +static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) +{ + struct btrfs_root *root = dir->root; + + return btrfs_start_transaction_fallback_global_rsv(root, + BTRFS_UNLINK_METADATA_UNITS); +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct inode *inode = d_inode(dentry); + int ret; + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + + /* This needs to handle no-key deletions later on */ + + trans = __unlink_start_trans(BTRFS_I(dir)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fscrypt_free; + } + + btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + false); + + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &fname.disk_name); + if (ret) + goto end_trans; + + if (inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) + goto end_trans; + } + +end_trans: + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); +fscrypt_free: + fscrypt_free_filename(&fname); + return ret; +} + +static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = dir->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + u64 objectid; + u64 dir_ino = btrfs_ino(dir); + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); + if (ret) + return ret; + + /* This needs to handle no-key deletions later on */ + + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { + objectid = inode->root->root_key.objectid; + } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + objectid = inode->location.objectid; + } else { + WARN_ON(1); + fscrypt_free_filename(&fname); + return -EINVAL; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + &fname.disk_name, -1); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_release_path(path); + + /* + * This is a placeholder inode for a subvolume we didn't have a + * reference to at the time of the snapshot creation. In the meantime + * we could have renamed the real subvol link into our snapshot, so + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. + * Instead simply lookup the dir_index_item for this entry so we can + * remove it. Otherwise we know we have a ref to the root and we can + * call btrfs_del_root_ref, and it _shouldn't_ fail. 
+ */ + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, ret); + goto out; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + index = key.offset; + btrfs_release_path(path); + } else { + ret = btrfs_del_root_ref(trans, objectid, + root->root_key.objectid, dir_ino, + &index, &fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + + ret = btrfs_delete_delayed_dir_index(trans, dir, index); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); + inode_inc_iversion(&dir->vfs_inode); + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); + ret = btrfs_update_inode_fallback(trans, root, dir); + if (ret) + btrfs_abort_transaction(trans, ret); +out: + btrfs_free_path(path); + fscrypt_free_filename(&fname); + return ret; +} + +/* + * Helper to check if the subvolume references other subvolumes or if it's + * default. + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct fscrypt_str name = FSTR_INIT("default", 7); + u64 dir_id; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, + dir_id, &name, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -EPERM; + btrfs_err(fs_info, + "deleting default subvolume %llu is not allowed", + key.objectid); + goto out; + } + btrfs_release_path(path); + } + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +/* Delete all dentries for inodes belonging to the root */ +static void btrfs_prune_dentries(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + if (!BTRFS_FS_ERROR(fs_info)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(entry)) + node = node->rb_left; + else if (objectid > btrfs_ino(entry)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(entry)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = btrfs_ino(entry) + 1; + inode = igrab(&entry->vfs_inode); 
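+ /*
+ * igrab() takes an extra i_count reference, or returns NULL if the
+ * inode is already being freed.
+ */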
+ if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); +} + +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = dir->root; + struct inode *inode = d_inode(dentry); + struct btrfs_root *dest = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 root_flags; + int ret; + + down_write(&fs_info->subvol_sem); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the inode lock so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + if (dest->send_in_progress) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu during send", + dest->root_key.objectid); + ret = -EPERM; + goto out_up_write; + } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + ret = -EPERM; + goto out_up_write; + } + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + + ret = may_destroy_subvol(dest); + if (ret) + goto out_undead; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, + * two for dir entries, + * two for root ref/backref. 
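+ *
+ * That is 1 + 2 + 2 = 5 metadata units, matching the count passed to
+ * btrfs_subvolume_reserve_metadata() just below.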
+ */ + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); + if (ret) + goto out_undead; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_release; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + btrfs_record_snapshot_destroy(trans, dir); + + ret = btrfs_unlink_subvol(trans, dir, dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + btrfs_set_root_drop_level(&dest->root_item, 0); + btrfs_set_root_refs(&dest->root_item, 0); + + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { + ret = btrfs_insert_orphan_item(trans, + fs_info->tree_root, + dest->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + } + + ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = btrfs_uuid_tree_remove(trans, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + } + + free_anon_bdev(dest->anon_dev); + dest->anon_dev = 0; +out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + ret = btrfs_end_transaction(trans); + inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, &block_rsv); +out_undead: + if (ret) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { + d_invalidate(dentry); + btrfs_prune_dentries(dest); + ASSERT(dest->send_in_progress == 0); + } + + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + int err = 0; + struct btrfs_trans_handle *trans; + u64 last_unlink_trans; + struct fscrypt_name fname; + + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + return btrfs_delete_subvolume(BTRFS_I(dir), dentry); + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err) + return err; + + /* This needs to handle no-key deletions later on */ + + trans = __unlink_start_trans(BTRFS_I(dir)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_notrans; + } + + if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + goto out; + } + + err = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (err) + goto out; + + last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &fname.disk_name); + if 
(!err) { + btrfs_i_size_write(BTRFS_I(inode), 0); + /* + * Propagate the last_unlink_trans value of the deleted dir to + * its parent directory. This is to prevent an unrecoverable + * log tree in the case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + */ + if (last_unlink_trans >= trans->transid) + BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; + } +out: + btrfs_end_transaction(trans); +out_notrans: + btrfs_btree_balance_dirty(fs_info); + fscrypt_free_filename(&fname); + + return err; +} + +/* + * btrfs_truncate_block - read, zero a chunk and write a block + * @inode - inode that we're zeroing + * @from - the offset to start zeroing + * @len - the length to zero, 0 to zero the entire range respective to the + * offset + * @front - zero up to the offset instead of from the offset on + * + * This will find the block for the "from" offset and cow the block and zero the + * part we want to zero. This is used with truncate and hole punching. + */ +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + bool only_release_metadata = false; + u32 blocksize = fs_info->sectorsize; + pgoff_t index = from >> PAGE_SHIFT; + unsigned offset = from & (blocksize - 1); + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; + int ret = 0; + u64 block_start; + u64 block_end; + + if (IS_ALIGNED(offset, blocksize) && + (!len || IS_ALIGNED(len, blocksize))) + goto out; + + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; + + ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, + blocksize, false); + if (ret < 0) { + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); + goto out; + } +again: + page = find_or_create_page(mapping, index, mask); + if (!page) { + btrfs_delalloc_release_space(inode, data_reserved, block_start, + blocksize, true); + btrfs_delalloc_release_extents(inode, blocksize); + ret = -ENOMEM; + goto out; + } + + if (!PageUptodate(page)) { + ret = btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + + /* + * We unlock the page after the io is completed and then re-lock it + * above. release_folio() could have come in between that and cleared + * PagePrivate(), but left the page in the mapping. Set the page mapped + * here to make sure it's properly set for the subpage stuff. 
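+ *
+ * On subpage filesystems (sectorsize smaller than PAGE_SIZE, e.g. a 4K
+ * sector size fs on a 64K page machine) page private carries the per-sector
+ * state, so it has to be attached before we dirty only part of the page.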
+ */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + + wait_on_page_writeback(page); + + lock_extent(io_tree, block_start, block_end, &cached_state); + + ordered = btrfs_lookup_ordered_extent(inode, block_start); + if (ordered) { + unlock_extent(io_tree, block_start, block_end, &cached_state); + unlock_page(page); + put_page(page); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); + + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, + &cached_state); + if (ret) { + unlock_extent(io_tree, block_start, block_end, &cached_state); + goto out_unlock; + } + + if (offset != blocksize) { + if (!len) + len = blocksize - offset; + if (front) + memzero_page(page, (block_start - page_offset(page)), + offset); + else + memzero_page(page, (block_start - page_offset(page)) + offset, + len); + } + btrfs_page_clear_checked(fs_info, page, block_start, + block_end + 1 - block_start); + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); + unlock_extent(io_tree, block_start, block_end, &cached_state); + + if (only_release_metadata) + set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, NULL); + +out_unlock: + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + } + btrfs_delalloc_release_extents(inode, blocksize); + unlock_page(page); + put_page(page); +out: + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + extent_changeset_free(data_reserved); + return ret; +} + +static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, + u64 offset, u64 len) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + struct btrfs_drop_extents_args drop_args = { 0 }; + int ret; + + /* + * If NO_HOLES is enabled, we don't need to do anything. + * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() + * or btrfs_update_inode() will be called, which guarantee that the next + * fsync will know this inode was changed and needs to be logged. + */ + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + drop_args.start = offset; + drop_args.end = offset + len; + drop_args.drop_cache = true; + + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); + if (ret) { + btrfs_abort_transaction(trans, ret); + } else { + btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); + btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans); + return ret; +} + +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. 
So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); + u64 block_end = ALIGN(size, fs_info->sectorsize); + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err = 0; + + /* + * If our size started in the middle of a block we need to zero out the + * rest of the block before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_block(inode, oldsize, 0, 0); + if (err) + return err; + + if (size <= hole_start) + return 0; + + btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, + &cached_state); + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset); + if (IS_ERR(em)) { + err = PTR_ERR(em); + em = NULL; + break; + } + last_byte = min(extent_map_end(em), block_end); + last_byte = ALIGN(last_byte, fs_info->sectorsize); + hole_size = last_byte - cur_offset; + + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + struct extent_map *hole_em; + + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) + break; + + err = btrfs_inode_set_file_extent_range(inode, + cur_offset, hole_size); + if (err) + break; + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_map_range(inode, cur_offset, + cur_offset + hole_size - 1, + false); + btrfs_set_inode_full_sync(inode); + goto next; + } + hole_em->start = cur_offset; + hole_em->len = hole_size; + hole_em->orig_start = cur_offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->orig_block_len = 0; + hole_em->ram_bytes = hole_size; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = fs_info->generation; + + err = btrfs_replace_extent_map_range(inode, hole_em, true); + free_extent_map(hole_em); + } else { + err = btrfs_inode_set_file_extent_range(inode, + cur_offset, hole_size); + if (err) + break; + } +next: + free_extent_map(em); + em = NULL; + cur_offset = last_byte; + if (cur_offset >= block_end) + break; + } + free_extent_map(em); + unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + return err; +} + +static int btrfs_setsize(struct inode *inode, struct iattr *attr) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; + int ret; + + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = inode_set_ctime_current(inode); + } + } + + if (newsize > oldsize) { + /* + * Don't do an expanding truncate while snapshotting is ongoing. 
+ * This is to ensure the snapshot captures a fully consistent + * state of this file - if the snapshot captures this expanding + * truncation, it must capture all writes that happened before + * this truncation. + */ + btrfs_drew_write_lock(&root->snapshot_lock); + ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); + if (ret) { + btrfs_drew_write_unlock(&root->snapshot_lock); + return ret; + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_drew_write_unlock(&root->snapshot_lock); + return PTR_ERR(trans); + } + + i_size_write(inode, newsize); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + pagecache_isize_extended(inode, oldsize, newsize); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_end_transaction(trans); + } else { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_wait_ordered_range(inode, + ALIGN(newsize, fs_info->sectorsize), + (u64)-1); + if (ret) + return ret; + } + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure any new writes to the file get on disk + * on close. + */ + if (newsize == 0) + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, + &BTRFS_I(inode)->runtime_flags); + + truncate_setsize(inode, newsize); + + inode_dio_wait(inode); + + ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); + if (ret && inode->i_nlink) { + int err; + + /* + * Truncate failed, so fix up the in-memory size. We + * adjusted disk_i_size down as we removed extents, so + * wait for disk_i_size to be stable and then update the + * in-memory size to match. + */ + err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (err) + return err; + i_size_write(inode, BTRFS_I(inode)->disk_i_size); + } + } + + return ret; +} + +static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct btrfs_root *root = BTRFS_I(inode)->root; + int err; + + if (btrfs_root_readonly(root)) + return -EROFS; + + err = setattr_prepare(idmap, dentry, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + err = btrfs_setsize(inode, attr); + if (err) + return err; + } + + if (attr->ia_valid) { + setattr_copy(idmap, inode, attr); + inode_inc_iversion(inode); + err = btrfs_dirty_inode(BTRFS_I(inode)); + + if (!err && attr->ia_valid & ATTR_MODE) + err = posix_acl_chmod(idmap, dentry, inode->i_mode); + } + + return err; +} + +/* + * While truncating the inode pages during eviction, we get the VFS + * calling btrfs_invalidate_folio() against each folio of the inode. This + * is slow because the calls to btrfs_invalidate_folio() result in a + * huge amount of calls to lock_extent() and clear_extent_bit(), + * which keep merging and splitting extent_state structures over and over, + * wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidate_folio() + * skip all those expensive operations on a per folio basis and do only + * the ordered io finishing, while we release here the extent_map and + * extent_state structures, without the excessive merging and splitting. 
+ */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages_final(&inode->i_data); + + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some + * ranges can still be locked and eviction started because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). + * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to an + * use-after-free issue. + */ + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + u64 start; + u64 end; + unsigned state_flags; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + start = state->start; + end = state->end; + state_flags = state->state; + spin_unlock(&io_tree->lock); + + lock_extent(io_tree, start, end, &cached_state); + + /* + * If still has DELALLOC flag, the extent didn't reach disk, + * and its reserved space won't be freed by delayed_ref. + * So we need to free its reserved space here. + * (Refer to comment in btrfs_invalidate_folio, case 2) + * + * Note, end is the bytenr of last byte, so we need + 1 here. + */ + if (state_flags & EXTENT_DELALLOC) + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, + end - start + 1, NULL); + + clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); + + cond_resched(); + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + +static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); + int ret; + + /* + * Eviction should be taking place at some place safe because of our + * delayed iputs. However the normal flushing code will run delayed + * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. + * + * We reserve the delayed_refs_extra here again because we can't use + * btrfs_start_transaction(root, 0) for the same deadlocky reason as + * above. We reserve our extra bit here because we generate a ton of + * delayed refs activity by truncating. + * + * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, + * if we fail to make this reservation we can re-try without the + * delayed_refs_extra so we can make some forward progress. 
+ */ + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { + btrfs_warn(fs_info, + "could not allocate space for delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + delayed_refs_extra = 0; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return trans; + + if (delayed_refs_extra) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, true); + } + return trans; +} + +void btrfs_evict_inode(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv = NULL; + int ret; + + trace_btrfs_inode_evict(inode); + + if (!root) { + fsverity_cleanup_inode(inode); + clear_inode(inode); + return; + } + + evict_inode_truncate_pages(inode); + + if (inode->i_nlink && + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(BTRFS_I(inode)))) + goto out; + + if (is_bad_inode(inode)) + goto out; + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + goto out; + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + goto out; + } + + /* + * This makes sure the inode item in tree is uptodate and the space for + * the inode update is released. + */ + ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); + if (ret) + goto out; + + /* + * This drops any pending insert or delete operations we have for this + * inode. We could have a delayed dir index deletion queued up, but + * we're removing the inode completely so that'll be taken care of in + * the truncate. + */ + btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) + goto out; + rsv->size = btrfs_calc_metadata_size(fs_info, 1); + rsv->failfast = true; + + btrfs_i_size_write(BTRFS_I(inode), 0); + + while (1) { + struct btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .new_size = 0, + .min_type = 0, + }; + + trans = evict_refill_and_join(root, rsv); + if (IS_ERR(trans)) + goto out; + + trans->block_rsv = rsv; + + ret = btrfs_truncate_inode_items(trans, root, &control); + trans->block_rsv = &fs_info->trans_block_rsv; + btrfs_end_transaction(trans); + /* + * We have not added new delayed items for our inode after we + * have flushed its delayed items, so no need to throttle on + * delayed items. However we have modified extent buffers. + */ + btrfs_btree_balance_dirty_nodelay(fs_info); + if (ret && ret != -ENOSPC && ret != -EAGAIN) + goto out; + else if (!ret) + break; + } + + /* + * Errors here aren't a big deal, it just means we leave orphan items in + * the tree. They will be cleaned up on the next mount. If the inode + * number gets reused, cleanup deletes the orphan item without doing + * anything, and unlink reuses the existing orphan item. + * + * If it turns out that we are dropping too many of these, we might want + * to add a mechanism for retrying these after a commit. 
+ */ + trans = evict_refill_and_join(root, rsv); + if (!IS_ERR(trans)) { + trans->block_rsv = rsv; + btrfs_orphan_del(trans, BTRFS_I(inode)); + trans->block_rsv = &fs_info->trans_block_rsv; + btrfs_end_transaction(trans); + } + +out: + btrfs_free_block_rsv(fs_info, rsv); + /* + * If we didn't successfully delete, the orphan item will still be in + * the tree and we'll retry on the next mount. Again, we might also want + * to retry these periodically in the future. + */ + btrfs_remove_delayed_node(BTRFS_I(inode)); + fsverity_cleanup_inode(inode); + clear_inode(inode); +} + +/* + * Return the key found in the dir entry in the location pointer, fill @type + * with BTRFS_FT_*, and return 0. + * + * If no dir entries were found, returns -ENOENT. + * If found a corrupted location in dir entry, returns -EUCLEAN. + */ +static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, + struct btrfs_key *location, u8 *type) +{ + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = dir->root; + int ret = 0; + struct fscrypt_name fname; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); + if (ret < 0) + goto out; + /* + * fscrypt_setup_filename() should never return a positive value, but + * gcc on sparc/parisc thinks it can, so assert that doesn't happen. + */ + ASSERT(ret == 0); + + /* This needs to handle no-key deletions later on */ + + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), + &fname.disk_name, 0); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; + goto out; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + ret = -EUCLEAN; + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, fname.disk_name.name, btrfs_ino(dir), + location->objectid, location->type, location->offset); + } + if (!ret) + *type = btrfs_dir_ftype(path->nodes[0], di); +out: + fscrypt_free_filename(&fname); + btrfs_free_path(path); + return ret; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. 
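+ *
+ * Editor's note, not part of the upstream source: a minimal caller sketch,
+ * mirroring how btrfs_lookup_dentry() below uses this helper when a dir
+ * item points at a BTRFS_ROOT_ITEM_KEY. The surrounding variables (fs_info,
+ * dir, dentry, location, root) are assumed to be set up as in that caller,
+ * and error handling is trimmed:
+ *
+ *	struct btrfs_root *sub_root = root;
+ *
+ *	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
+ *				       &location, &sub_root);
+ *	if (ret == 0) {
+ *		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ *		btrfs_put_root(sub_root);
+ *	}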
+ */ +static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, + struct btrfs_inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) +{ + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + int err = 0; + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); + if (ret) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + err = -ENOENT; + key.objectid = dir->root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) + goto out; + + ret = memcmp_extent_buffer(leaf, fname.disk_name.name, + (unsigned long)(ref + 1), fname.disk_name.len); + if (ret) + goto out; + + btrfs_release_path(path); + + new_root = btrfs_get_fs_root(fs_info, location->objectid, true); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + fscrypt_free_filename(&fname); + return err; +} + +static void inode_tree_add(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; + struct rb_node *new = &inode->rb_node; + u64 ino = btrfs_ino(inode); + + if (inode_unhashed(&inode->vfs_inode)) + return; + parent = NULL; + spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (ino < btrfs_ino(entry)) + p = &parent->rb_left; + else if (ino > btrfs_ino(entry)) + p = &parent->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING))); + rb_replace_node(parent, new, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + return; + } + } + rb_link_node(new, parent, p); + rb_insert_color(new, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); + empty = RB_EMPTY_ROOT(&root->inode_tree); + } + spin_unlock(&root->inode_lock); + + if (empty && btrfs_root_refs(&root->root_item) == 0) { + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + + inode->i_ino = args->ino; + BTRFS_I(inode)->location.objectid = args->ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + BTRFS_I(inode)->root = btrfs_grab_root(args->root); + BUG_ON(args->root && !BTRFS_I(inode)->root); + + if (args->root && args->root == args->root->fs_info->tree_root && + args->ino 
!= BTRFS_BTREE_INODE_OBJECTID) + set_bit(BTRFS_INODE_FREE_SPACE_INODE, + &BTRFS_I(inode)->runtime_flags); + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + + return args->ino == BTRFS_I(inode)->location.objectid && + args->root == BTRFS_I(inode)->root; +} + +static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + unsigned long hashval = btrfs_inode_hash(ino, root); + + args.ino = ino; + args.root = root; + + inode = iget5_locked(s, hashval, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* + * Get an inode object given its inode number and corresponding root. + * Path can be preallocated to prevent recursing back to iget through + * allocator. NULL is also valid but may require an additional allocation + * later. + */ +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int ret; + + ret = btrfs_read_locked_inode(inode, path); + if (!ret) { + inode_tree_add(BTRFS_I(inode)); + unlock_new_inode(inode); + } else { + iget_failed(inode); + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_read_locked_inode, this means the inode item + * was not found. + */ + if (ret > 0) + ret = -ENOENT; + inode = ERR_PTR(ret); + } + } + + return inode; +} + +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +{ + return btrfs_iget_path(s, ino, root, NULL); +} + +static struct inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + BTRFS_I(inode)->root = btrfs_grab_root(root); + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + /* + * We only need lookup, the rest is read-only and there's no inode + * associated with the dentry + */ + inode->i_op = &simple_dir_inode_operations; + inode->i_opflags &= ~IOP_XATTR; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = inode_set_ctime_current(inode); + inode->i_atime = dir->i_atime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + inode->i_uid = dir->i_uid; + inode->i_gid = dir->i_gid; + + return inode; +} + +static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); +static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); +static_assert(BTRFS_FT_DIR == FT_DIR); +static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); +static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); +static_assert(BTRFS_FT_FIFO == FT_FIFO); +static_assert(BTRFS_FT_SOCK == FT_SOCK); +static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return fs_umode_to_ftype(inode->i_mode); +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + u8 di_type = 0; + int ret = 0; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(BTRFS_I(dir), 
dentry, &location, &di_type); + if (ret < 0) + return ERR_PTR(ret); + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, location.objectid, root); + if (IS_ERR(inode)) + return inode; + + /* Do extra check against inode mode with di_type */ + if (btrfs_inode_type(inode) != di_type) { + btrfs_crit(fs_info, +"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", + inode->i_mode, btrfs_inode_type(inode), + di_type); + iput(inode); + return ERR_PTR(-EUCLEAN); + } + return inode; + } + + ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir, &location, root); + } else { + inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); + btrfs_put_root(sub_root); + + if (IS_ERR(inode)) + return inode; + + down_read(&fs_info->cleanup_work_sem); + if (!sb_rdonly(inode->i_sb)) + ret = btrfs_orphan_cleanup(sub_root); + up_read(&fs_info->cleanup_work_sem); + if (ret) { + iput(inode); + inode = ERR_PTR(ret); + } + } + + return inode; +} + +static int btrfs_dentry_delete(const struct dentry *dentry) +{ + struct btrfs_root *root; + struct inode *inode = d_inode(dentry); + + if (!inode && !IS_ROOT(dentry)) + inode = d_inode(dentry->d_parent); + + if (inode) { + root = BTRFS_I(inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return 1; + } + return 0; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = btrfs_lookup_dentry(dir, dentry); + + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; + return d_splice_alias(inode, dentry); +} + +/* + * Find the highest existing sequence number in a directory and then set the + * in-memory index_cnt variable to the first free sequence number. + */ +static int btrfs_set_inode_index_count(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + if (path->slots[0] == 0) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + found_key.type != BTRFS_DIR_INDEX_KEY) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + inode->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) +{ + int ret = 0; + + btrfs_inode_lock(dir, 0); + if (dir->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + goto out; + } + } + + /* index_cnt is the index number of next new entry, so decrement it. */ + *index = dir->index_cnt - 1; +out: + btrfs_inode_unlock(dir, 0); + + return ret; +} + +/* + * All this infrastructure exists because dir_emit can fault, and we are holding + * the tree lock when doing readdir. 
For now just allocate a buffer and copy + * our information into that, and then dir_emit from the buffer. This is + * similar to what NFS does, only we don't keep the buffer around in pagecache + * because I'm afraid I'll mess that up. Long term we need to make filldir do + * copy_to_user_inatomic so we don't have to worry about page faulting under the + * tree lock. + */ +static int btrfs_opendir(struct inode *inode, struct file *file) +{ + struct btrfs_file_private *private; + u64 last_index; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); + if (ret) + return ret; + + private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); + if (!private) + return -ENOMEM; + private->last_index = last_index; + private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!private->filldir_buf) { + kfree(private); + return -ENOMEM; + } + file->private_data = private; + return 0; +} + +static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct btrfs_file_private *private = file->private_data; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), + &private->last_index); + if (ret) + return ret; + + return generic_file_llseek(file, offset, whence); +} + +struct dir_entry { + u64 ino; + u64 offset; + unsigned type; + int name_len; +}; + +static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) +{ + while (entries--) { + struct dir_entry *entry = addr; + char *name = (char *)(entry + 1); + + ctx->pos = get_unaligned(&entry->offset); + if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), + get_unaligned(&entry->ino), + get_unaligned(&entry->type))) + return 1; + addr += sizeof(struct dir_entry) + + get_unaligned(&entry->name_len); + ctx->pos++; + } + return 0; +} + +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_private *private = file->private_data; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + void *addr; + LIST_HEAD(ins_list); + LIST_HEAD(del_list); + int ret; + char *name_ptr; + int name_len; + int entries = 0; + int total_len = 0; + bool put = false; + struct btrfs_key location; + + if (!dir_emit_dots(file, ctx)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + addr = private->filldir_buf; + path->reada = READA_FORWARD; + + put = btrfs_readdir_get_delayed_items(inode, private->last_index, + &ins_list, &del_list); + +again: + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = ctx->pos; + key.objectid = btrfs_ino(BTRFS_I(inode)); + + btrfs_for_each_slot(root, &key, &found_key, path, ret) { + struct dir_entry *entry; + struct extent_buffer *leaf = path->nodes[0]; + u8 ftype; + + if (found_key.objectid != key.objectid) + break; + if (found_key.type != BTRFS_DIR_INDEX_KEY) + break; + if (found_key.offset < ctx->pos) + continue; + if (found_key.offset > private->last_index) + break; + if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) + continue; + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + name_len = btrfs_dir_name_len(leaf, di); + if ((total_len + sizeof(struct dir_entry) + name_len) >= + PAGE_SIZE) { + btrfs_release_path(path); + ret = btrfs_filldir(private->filldir_buf, entries, ctx); + if (ret) + goto nopos; + addr = private->filldir_buf; + entries = 0; + total_len = 0; + goto again; + } + + ftype = 
btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); + entry = addr; + name_ptr = (char *)(entry + 1); + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + put_unaligned(name_len, &entry->name_len); + put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); + btrfs_dir_item_key_to_cpu(leaf, di, &location); + put_unaligned(location.objectid, &entry->ino); + put_unaligned(found_key.offset, &entry->offset); + entries++; + addr += sizeof(struct dir_entry) + name_len; + total_len += sizeof(struct dir_entry) + name_len; + } + /* Catch error encountered during iteration */ + if (ret < 0) + goto err; + + btrfs_release_path(path); + + ret = btrfs_filldir(private->filldir_buf, entries, ctx); + if (ret) + goto nopos; + + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + if (ret) + goto nopos; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. + */ + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; +nopos: + ret = 0; +err: + if (put) + btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); + btrfs_free_path(path); + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +static int btrfs_dirty_inode(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + int ret; + + if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) + return 0; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans); + if (inode->delayed_node) + btrfs_balance_delayed_items(fs_info); + + return ret; +} + +/* + * This is a copy of file_update_time. We need this so we can return error on + * ENOSPC for updating the inode in the case of file write and mmap writes. + */ +static int btrfs_update_time(struct inode *inode, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + bool dirty = flags & ~S_VERSION; + + if (btrfs_root_readonly(root)) + return -EROFS; + + dirty = inode_update_timestamps(inode, flags); + return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; +} + +/* + * helper to find a free sequence number in a given directory. 
This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) +{ + int ret = 0; + + if (dir->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = dir->index_cnt; + dir->index_cnt++; + + return ret; +} + +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + + args.ino = BTRFS_I(inode)->location.objectid; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items) +{ + struct inode *dir = args->dir; + struct inode *inode = args->inode; + int ret; + + if (!args->orphan) { + ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, + &args->fname); + if (ret) + return ret; + } + + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); + if (ret) { + fscrypt_free_filename(&args->fname); + return ret; + } + + /* 1 to add inode item */ + *trans_num_items = 1; + /* 1 to add compression property */ + if (BTRFS_I(dir)->prop_compress) + (*trans_num_items)++; + /* 1 to add default ACL xattr */ + if (args->default_acl) + (*trans_num_items)++; + /* 1 to add access ACL xattr */ + if (args->acl) + (*trans_num_items)++; +#ifdef CONFIG_SECURITY + /* 1 to add LSM xattr */ + if (dir->i_security) + (*trans_num_items)++; +#endif + if (args->orphan) { + /* 1 to add orphan item */ + (*trans_num_items)++; + } else { + /* + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item + * + * No need for 1 unit for the inode ref item because it is + * inserted in a batch together with the inode item at + * btrfs_create_new_inode(). + */ + *trans_num_items += 3; + } + return 0; +} + +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) +{ + posix_acl_release(args->acl); + posix_acl_release(args->default_acl); + fscrypt_free_filename(&args->fname); +} + +/* + * Inherit flags from the parent inode. + * + * Currently only the compression flags and the cow flags are inherited. + */ +static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) +{ + unsigned int flags; + + flags = dir->flags; + + if (flags & BTRFS_INODE_NOCOMPRESS) { + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) { + inode->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->flags |= BTRFS_INODE_NODATASUM; + } + + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); +} + +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args) +{ + struct inode *dir = args->dir; + struct inode *inode = args->inode; + const struct fscrypt_str *name = args->orphan ? 
NULL : &args->fname.disk_name; + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + u64 objectid; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + struct btrfs_item_batch batch; + unsigned long ptr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!args->subvol) + BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); + root = BTRFS_I(inode)->root; + + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) + goto out; + inode->i_ino = objectid; + + if (args->orphan) { + /* + * O_TMPFILE, set link count to 0, so that after this point, we + * fill in an inode item with the correct link count. + */ + set_nlink(inode, 0); + } else { + trace_btrfs_inode_request(dir); + + ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); + if (ret) + goto out; + } + /* index_cnt is ignored for everything but a dir. */ + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; + + /* + * Subvolumes don't inherit flags from their parent directory. + * Originally this was probably by accident, but we probably can't + * change it now without compatibility issues. + */ + if (!args->subvol) + btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); + + if (S_ISREG(inode->i_mode)) { + if (btrfs_test_opt(fs_info, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(fs_info, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) { + if (!args->orphan) + BTRFS_I(dir)->index_cnt--; + goto out; + } + + /* + * We could have gotten an inode number from somebody who was fsynced + * and then removed in this same transaction, so let's just set full + * sync since it will be a full sync anyway and this will blow away the + * old info in the log. + */ + btrfs_set_inode_full_sync(BTRFS_I(inode)); + + key[0].objectid = objectid; + key[0].type = BTRFS_INODE_ITEM_KEY; + key[0].offset = 0; + + sizes[0] = sizeof(struct btrfs_inode_item); + + if (!args->orphan) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ + key[1].objectid = objectid; + key[1].type = BTRFS_INODE_REF_KEY; + if (args->subvol) { + key[1].offset = objectid; + sizes[1] = 2 + sizeof(*ref); + } else { + key[1].offset = btrfs_ino(BTRFS_I(dir)); + sizes[1] = name->len + sizeof(*ref); + } + } + + batch.keys = &key[0]; + batch.data_sizes = &sizes[0]; + batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); + batch.nr = args->orphan ? 1 : 2; + ret = btrfs_insert_empty_items(trans, root, path, &batch); + if (ret != 0) { + btrfs_abort_transaction(trans, ret); + goto discard; + } + + inode->i_mtime = inode_set_ctime_current(inode); + inode->i_atime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + + /* + * We're going to fill the inode item now, so at this point the inode + * must be fully initialized. 
+ */ + + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, + sizeof(*inode_item)); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + if (!args->orphan) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + ptr = (unsigned long)(ref + 1); + if (args->subvol) { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); + btrfs_set_inode_ref_index(path->nodes[0], ref, 0); + write_extent_buffer(path->nodes[0], "..", ptr, 2); + } else { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, + name->len); + btrfs_set_inode_ref_index(path->nodes[0], ref, + BTRFS_I(inode)->dir_index); + write_extent_buffer(path->nodes[0], name->name, ptr, + name->len); + } + } + + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. + */ + btrfs_free_path(path); + path = NULL; + + if (args->subvol) { + struct inode *parent; + + /* + * Subvolumes inherit properties from their parent subvolume, + * not the directory they were created in. + */ + parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_I(dir)->root); + if (IS_ERR(parent)) { + ret = PTR_ERR(parent); + } else { + ret = btrfs_inode_inherit_props(trans, inode, parent); + iput(parent); + } + } else { + ret = btrfs_inode_inherit_props(trans, inode, dir); + } + if (ret) { + btrfs_err(fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, + ret); + } + + /* + * Subvolumes don't inherit ACLs or get passed to the LSM. This is + * probably a bug. + */ + if (!args->subvol) { + ret = btrfs_init_inode_security(trans, args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } + } + + inode_tree_add(BTRFS_I(inode)); + + trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); + + btrfs_update_root_times(trans, root); + + if (args->orphan) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + } else { + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + 0, BTRFS_I(inode)->dir_index); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } + + return 0; + +discard: + /* + * discard_new_inode() calls iput(), but the caller owns the reference + * to the inode. + */ + ihold(inode); + discard_new_inode(inode); +out: + btrfs_free_path(path); + return ret; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. 
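+ *
+ * Editor's note, not part of the upstream source: a typical call sequence,
+ * modelled on btrfs_link() further down in this file. 'trans', 'dir',
+ * 'inode' and 'fname' are assumed to be set up as in that caller, the
+ * backref flag is 1, and error handling is trimmed:
+ *
+ *	u64 index;
+ *
+ *	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
+ *	if (!err)
+ *		err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ *				     &fname.disk_name, 1, index);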
+ */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = parent_inode->root; + u64 ino = btrfs_ino(inode); + u64 parent_ino = btrfs_ino(parent_inode); + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &inode->root->root_key, sizeof(key)); + } else { + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + } + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, key.objectid, + root->root_key.objectid, parent_ino, + index, name); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, name, + ino, parent_ino, index); + } + + /* Nothing to clean up yet */ + if (ret) + return ret; + + ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, + btrfs_inode_type(&inode->vfs_inode), index); + if (ret == -EEXIST || ret == -EOVERFLOW) + goto fail_dir_item; + else if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + + name->len * 2); + inode_inc_iversion(&parent_inode->vfs_inode); + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + parent_inode->vfs_inode.i_mtime = + inode_set_ctime_current(&parent_inode->vfs_inode); + + ret = btrfs_update_inode(trans, root, parent_inode); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, key.objectid, + root->root_key.objectid, parent_ino, + &local_index, name); + if (err) + btrfs_abort_transaction(trans, err); + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, + &local_index); + if (err) + btrfs_abort_transaction(trans, err); + } + + /* Return the original error code */ + return ret; +} + +static int btrfs_create_common(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .inode = inode, + }; + unsigned int trans_num_items; + struct btrfs_trans_handle *trans; + int err; + + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (err) + goto out_inode; + + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; + } + + err = btrfs_create_new_inode(trans, &new_inode_args); + if (!err) + d_instantiate_new(dentry, inode); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); + return err; +} + +static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(idmap, inode, dir, mode); + inode->i_op = 
&btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + return btrfs_create_common(dir, dentry, inode); +} + +static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(idmap, inode, dir, mode); + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + return btrfs_create_common(dir, dentry, inode); +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = d_inode(old_dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct fscrypt_name fname; + u64 index; + int err; + int drop_inode = 0; + + /* do not allow sys_link's with other subvols of the same device */ + if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) + return -EXDEV; + + if (inode->i_nlink >= BTRFS_LINK_MAX) + return -EMLINK; + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto fail; + + err = btrfs_set_inode_index(BTRFS_I(dir), &index); + if (err) + goto fail; + + /* + * 2 items for inode and inode ref + * 2 items for dir items + * 1 item for parent inode + * 1 item for orphan item deletion if O_TMPFILE + */ + trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + goto fail; + } + + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; + inc_nlink(inode); + inode_inc_iversion(inode); + inode_set_ctime_current(inode); + ihold(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); + + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + &fname.disk_name, 1, index); + + if (err) { + drop_inode = 1; + } else { + struct dentry *parent = dentry->d_parent; + + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (err) + goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. 
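+ *
+ * Editor's note, illustration only: roughly the userspace sequence
+ * that reaches this branch; the mount point, file name and the fd
+ * number N below are hypothetical:
+ *
+ *	fd = open("/mnt", O_TMPFILE | O_RDWR, 0600);
+ *	... write data through fd ...
+ *	linkat(AT_FDCWD, "/proc/self/fd/N", AT_FDCWD,
+ *	       "/mnt/newname", AT_SYMLINK_FOLLOW);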
+ */ + err = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (err) + goto fail; + } + d_instantiate(dentry, inode); + btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); + } + +fail: + fscrypt_free_filename(&fname); + if (trans) + btrfs_end_transaction(trans); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(fs_info); + return err; +} + +static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(idmap, inode, dir, S_IFDIR | mode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + return btrfs_create_common(dir, dentry, inode); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct page *page, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + int compress_type; + + compress_type = btrfs_file_extent_compression(leaf, item); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_SIZE, max_size); + ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + + /* + * decompression code contains a memset to fill in any space between the end + * of the uncompressed data and the end of max_size in case the decompressed + * data ends up shorter than ram_bytes. That doesn't cover the hole between + * the end of an inline extent and the beginning of the next block, so we + * cover that region here. + */ + + if (max_size < PAGE_SIZE) + memzero_page(page, max_size, PAGE_SIZE - max_size); + kfree(tmp); + return ret; +} + +static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, + struct page *page) +{ + struct btrfs_file_extent_item *fi; + void *kaddr; + size_t copy_size; + + if (!page || PageUptodate(page)) + return 0; + + ASSERT(page_offset(page) == 0); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) + return uncompress_inline(path, page, fi); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(path->nodes[0], fi)); + kaddr = kmap_local_page(page); + read_extent_buffer(path->nodes[0], kaddr, + btrfs_file_extent_inline_start(fi), copy_size); + kunmap_local(kaddr); + if (copy_size < PAGE_SIZE) + memzero_page(page, copy_size, PAGE_SIZE - copy_size); + return 0; +} + +/* + * Lookup the first extent overlapping a range in a file. + * + * @inode: file to search in + * @page: page to read extent data into if the extent is inline + * @pg_offset: offset into @page to copy to + * @start: file offset + * @len: length of range starting at @start + * + * Return the first &struct extent_map which overlaps the given range, reading + * it from the B-tree and caching it if necessary. Note that there may be more + * extents which overlap the given range after the returned extent_map. + * + * If @page is not NULL and the extent is inline, this also reads the extent + * data directly into the page and marks the extent up to date in the io_tree. 
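+ *
+ * Editor's note, not part of the upstream source: a minimal caller sketch
+ * based on the hole-expansion loop in btrfs_cont_expand() above; 'inode',
+ * 'start' and 'len' are assumed to come from the caller, and error handling
+ * is trimmed:
+ *
+ *	em = btrfs_get_extent(inode, NULL, 0, start, len);
+ *	if (IS_ERR(em))
+ *		return PTR_ERR(em);
+ *	if (em->block_start == EXTENT_MAP_HOLE)
+ *		... the range [em->start, extent_map_end(em)) is a hole ...
+ *	free_extent_map(em);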
+ * + * Return: ERR_PTR on error, non-NULL extent_map on success. + */ +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + int ret = 0; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = btrfs_ino(inode); + int extent_type = -1; + struct btrfs_path *path = NULL; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &inode->extent_tree; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto out; + } + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* Chances are we'll be called again, so go ahead and do readahead */ + path->reada = READA_FORWARD; + + /* + * The same explanation in load_free_space_cache applies here as well, + * we only read when we're loading the free space cache, and at that + * point the commit_root has everything we need. + */ + if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + ret = 0; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) { + /* + * If we backup past the first extent we want to move forward + * and see if there is an extent in front of us, otherwise we'll + * say there is a hole for our whole search range which can + * cause problems. 
+ */ + extent_end = start; + goto next; + } + + extent_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + extent_end = btrfs_file_extent_end(path); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + /* Only regular file could have regular/prealloc extent */ + if (!S_ISREG(inode->vfs_inode.i_mode)) { + ret = -EUCLEAN; + btrfs_crit(fs_info, + "regular/prealloc extent found for non-regular inode %llu", + btrfs_ino(inode)); + goto out; + } + trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, + extent_start); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, + path->slots[0], + extent_start); + } +next: + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + goto not_found; + + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + if (start > found_key.offset) + goto next; + + /* New extent overlaps with existing one */ + em->start = start; + em->orig_start = start; + em->len = found_key.offset - start; + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + + btrfs_extent_item_to_extent_map(inode, path, item, em); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + goto insert; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * Inline extent can only exist at file offset 0. This is + * ensured by tree-checker and inline extent creation path. + * Thus all members representing file offsets should be zero. + */ + ASSERT(pg_offset == 0); + ASSERT(extent_start == 0); + ASSERT(em->start == 0); + + /* + * btrfs_extent_item_to_extent_map() should have properly + * initialized em members already. + * + * Other members are not utilized for inline extents. + */ + ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->len == fs_info->sectorsize); + + ret = read_inline_extent(inode, path, page); + if (ret < 0) + goto out; + goto insert; + } +not_found: + em->start = start; + em->orig_start = start; + em->len = len; + em->block_start = EXTENT_MAP_HOLE; +insert: + ret = 0; + btrfs_release_path(path); + if (em->start > start || extent_map_end(em) <= start) { + btrfs_err(fs_info, + "bad extent! 
em: [%llu %llu] passed [%llu %llu]", + em->start, em->len, start, len); + ret = -EIO; + goto out; + } + + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); +out: + btrfs_free_path(path); + + trace_btrfs_get_extent(root, inode, em); + + if (ret) { + free_extent_map(em); + return ERR_PTR(ret); + } + return em; +} + +static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = create_io_em(inode, start, len, orig_start, block_start, + block_len, orig_block_len, ram_bytes, + BTRFS_COMPRESS_NONE, /* compress_type */ + type); + if (IS_ERR(em)) + goto out; + } + ordered = btrfs_alloc_ordered_extent(inode, start, len, len, + block_start, block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); + } + em = ERR_CAST(ordered); + } else { + ASSERT(!dio_data->ordered); + dio_data->ordered = ordered; + } + out: + + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 len) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_map *em; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + alloc_hint = get_extent_allocation_hint(inode, start, len); +again: + ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, + 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } + if (ret) + return ERR_PTR(ret); + + em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + if (IS_ERR(em)) + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, + 1); + + return em; +} + +static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + bool readonly = false; + + block_group = btrfs_lookup_block_group(fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = true; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + +/* + * Check if we can do nocow write into the range [@offset, @offset + @len) + * + * @offset: File offset + * @len: The length to write, will be updated to the nocow writeable + * range + * @orig_start: (optional) Return the original file offset of the file extent + * @orig_len: (optional) Return the original on-disk length of the file extent + * @ram_bytes: (optional) Return the ram_bytes of the file extent + * @strict: if true, omit optimizations that might force us into unnecessary + * cow. e.g., don't trust generation number. + * + * Return: + * >0 and update @len if we can do nocow write + * 0 if we can't do nocow write + * <0 if error happened + * + * NOTE: This only checks the file extents, caller is responsible to wait for + * any ordered extents. 
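+ *
+ * Editor's note, not part of the upstream source: a minimal caller sketch;
+ * 'inode', 'offset' and 'write_bytes' are hypothetical values supplied by
+ * the write path, and the optional output arguments are simply skipped:
+ *
+ *	u64 len = write_bytes;
+ *
+ *	ret = can_nocow_extent(inode, offset, &len, NULL, NULL, NULL,
+ *			       false, false);
+ *	if (ret > 0)
+ *		... NOCOW is allowed for the first 'len' bytes ...
+ *	else if (ret == 0)
+ *		... fall back to a COW write ...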
+ */ +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct can_nocow_file_extent_args nocow_args = { 0 }; + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->nowait = nowait; + + ret = btrfs_lookup_file_extent(NULL, root, path, + btrfs_ino(BTRFS_I(inode)), offset, 0); + if (ret < 0) + goto out; + + if (ret == 1) { + if (path->slots[0] == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + path->slots[0]--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + if (btrfs_file_extent_end(path) <= offset) + goto out; + + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (ram_bytes) + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + + nocow_args.start = offset; + nocow_args.end = offset + *len - 1; + nocow_args.strict = strict; + nocow_args.free_path = true; + + ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + /* can_nocow_file_extent() has freed the path. */ + path = NULL; + + if (ret != 1) { + /* Treat errors as not being able to NOCOW. */ + ret = 0; + goto out; + } + + ret = 0; + if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) + goto out; + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + nocow_args.num_bytes, + root->fs_info->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + + if (orig_start) + *orig_start = key.offset - nocow_args.extent_offset; + if (orig_block_len) + *orig_block_len = nocow_args.disk_num_bytes; + + *len = nocow_args.num_bytes; + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + struct extent_state **cached_state, + unsigned int iomap_flags) +{ + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + int ret = 0; + + while (1) { + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) + return -EAGAIN; + } else { + lock_extent(io_tree, lockstart, lockend, cached_state); + } + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure there's no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, + lockend - lockstart + 1); + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. 
The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && + (!writing || !filemap_range_has_page(inode->i_mapping, + lockstart, lockend))) + break; + + unlock_extent(io_tree, lockstart, lockend, cached_state); + + if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } + /* + * If we are doing a DIO read and the ordered extent we + * found is for a buffered write, we can not wait for it + * to complete and retry, because if we do so we can + * deadlock with concurrent buffered writes on page + * locks. This happens only if our DIO read covers more + * than one extent map, if at this point has already + * created an ordered extent for a previous extent map + * and locked its range in the inode's io tree, and a + * concurrent write against that previous extent map's + * range and this range started (we unlock the ranges + * in the io tree only when the bios complete and + * buffered writes always lock pages before attempting + * to lock range in the io tree). + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) + btrfs_start_ordered_extent(ordered); + else + ret = nowait ? -EAGAIN : -ENOTBLK; + btrfs_put_ordered_extent(ordered); + } else { + /* + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readahead (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readahead wait for that + * ordered extent to complete while holding a lock on + * that page. + */ + ret = nowait ? -EAGAIN : -ENOTBLK; + } + + if (ret) + break; + + cond_resched(); + } + + return ret; +} + +/* The callers of this must take lock_extent() */ +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, + u64 block_len, u64 orig_block_len, + u64 ram_bytes, int compress_type, + int type) +{ + struct extent_map *em; + int ret; + + ASSERT(type == BTRFS_ORDERED_PREALLOC || + type == BTRFS_ORDERED_COMPRESSED || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_REGULAR); + + em = alloc_extent_map(); + if (!em) + return ERR_PTR(-ENOMEM); + + em->start = start; + em->orig_start = orig_start; + em->len = len; + em->block_len = block_len; + em->block_start = block_start; + em->orig_block_len = orig_block_len; + em->ram_bytes = ram_bytes; + em->generation = -1; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + if (type == BTRFS_ORDERED_PREALLOC) { + set_bit(EXTENT_FLAG_FILLING, &em->flags); + } else if (type == BTRFS_ORDERED_COMPRESSED) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + + ret = btrfs_replace_extent_map_range(inode, em, true); + if (ret) { + free_extent_map(em); + return ERR_PTR(ret); + } + + /* em got 2 refs now, callers needs to do free_extent_map once. 
*/ + return em; +} + + +static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 *lenp, + unsigned int iomap_flags) +{ + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em = *map; + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + struct btrfs_block_group *bg; + bool can_nocow = false; + bool space_reserved = false; + u64 len = *lenp; + u64 prev_len; + int ret = 0; + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes, false, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } + } + + prev_len = len; + if (can_nocow) { + struct extent_map *em2; + + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); + if (ret < 0) { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; + goto out; + } + space_reserved = true; + + em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(bg); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em2; + em = em2; + } + + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + + dio_data->nocow_done = true; + } else { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + if (nowait) { + ret = -EAGAIN; + goto out; + } + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) { + ret = -ENOSPC; + goto out; + } + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. + */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); + } + + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. 
+ */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); +out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } + *lenp = len; + return ret; +} + +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em; + struct extent_state *cached_state = NULL; + struct btrfs_dio_data *dio_data = iter->private; + u64 lockstart, lockend; + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + u64 len = length; + const u64 data_alloc_len = length; + bool unlock_extents = false; + + /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ + if (!write) + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); + + lockstart = start; + lockend = start + len - 1; + + /* + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } + } + + memset(dio_data, 0, sizeof(*dio_data)); + + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). 
If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len, false); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } + + /* + * If this errors out it's because we couldn't invalidate pagecache for + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. + */ + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) + goto err; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because that's what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; + goto unlock_err; + } + + len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. 
This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, &len, flags); + if (ret < 0) + goto unlock_err; + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } + } else { + /* + * We need to unlock only the end area that we aren't using. + * The rest is going to be unlocked by the endio routine. + */ + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; + } + + if (unlock_extents) + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. 
+ */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; + iomap->length = len; + free_extent_map(em); + + return 0; + +unlock_err: + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); +err: + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } + + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, + NULL); + return 0; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + pos, length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); + ret = -ENOTBLK; + } + if (write) { + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } + + if (write) + extent_changeset_free(dio_data->data_reserved); + return ret; +} + +static void btrfs_dio_end_io(struct btrfs_bio *bbio) +{ + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; + struct bio *bio = &bbio->bio; + + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + btrfs_finish_ordered_extent(bbio->ordered, NULL, + dip->file_offset, dip->bytes, + !bio->bi_status); + } else { + unlock_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); + } + + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); +} + +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; + + btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_dio_end_io, bio->bi_private); + bbio->inode = BTRFS_I(iter->inode); + bbio->file_offset = file_offset; + + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; + + dio_data->submitted += bio->bi_iter.bi_size; + + /* + * Check if we are doing a partial write. If we are, we need to split + * the ordered extent to match the submitted bio. Hang on to the + * remaining unfinishable ordered_extent in dio_data so that it can be + * cancelled in iomap_end to avoid a deadlock wherein faulting the + * remaining pages is blocked on the outstanding ordered extent. 
+ */ + if (iter->flags & IOMAP_WRITE) { + int ret; + + ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); + if (ret) { + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); + return; + } + } + + btrfs_submit_bio(bbio, 0); +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_dio_submit_io, + .bio_set = &btrfs_dio_bioset, +}; + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +{ + struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data = { 0 }; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + int ret; + + ret = fiemap_prep(inode, fieinfo, start, &len, 0); + if (ret) + return ret; + + /* + * fiemap_prep() called filemap_write_and_wait() for the whole possible + * file range (0 to LLONG_MAX), but that is not enough if we have + * compression enabled. The first filemap_fdatawrite_range() only kicks + * in the compression of data (in an async thread) and will return + * before the compression is done and writeback is started. A second + * filemap_fdatawrite_range() is needed to wait for the compression to + * complete and writeback to start. We also need to wait for ordered + * extents to complete, because our fiemap implementation uses mainly + * file extent items to list the extents, searching for extent maps + * only for file ranges with holes or prealloc extents to figure out + * if we have delalloc in those ranges. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) + return ret; + } + + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); +} + +static int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return extent_writepages(mapping, wbc); +} + +static void btrfs_readahead(struct readahead_control *rac) +{ + extent_readahead(rac); +} + +/* + * For release_folio() and invalidate_folio() we have a race window where + * folio_end_writeback() is called but the subpage spinlock is not yet released. + * If we continue to release/invalidate the page, we could cause use-after-free + * for subpage spinlock. So this function is to spin and wait for subpage + * spinlock. + */ +static void wait_subpage_spinlock(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; + + if (!btrfs_is_subpage(fs_info, page)) + return; + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * This may look insane as we just acquire the spinlock and release it, + * without doing anything. But we just want to make sure no one is + * still holding the subpage spinlock. + * And since the page is not dirty nor writeback, and we have page + * locked, the only possible way to hold a spinlock is from the endio + * function to clear page writeback. 
+ * + * Here we just acquire the spinlock so that all existing callers + * should exit and we're safe to release/invalidate the page. + */ + spin_lock_irq(&subpage->lock); + spin_unlock_irq(&subpage->lock); +} + +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + int ret = try_release_extent_mapping(&folio->page, gfp_flags); + + if (ret == 1) { + wait_subpage_spinlock(&folio->page); + clear_page_extent_mapped(&folio->page); + } + return ret; +} + +static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; + return __btrfs_release_folio(folio, gfp_flags); +} + +#ifdef CONFIG_MIGRATION +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + int ret = filemap_migrate_folio(mapping, dst, src, mode); + + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); + } + + return MIGRATEPAGE_SUCCESS; +} +#else +#define btrfs_migrate_folio NULL +#endif + +static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) +{ + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *tree = &inode->io_tree; + struct extent_state *cached_state = NULL; + u64 page_start = folio_pos(folio); + u64 page_end = page_start + folio_size(folio) - 1; + u64 cur; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + + /* + * We have folio locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this folio. + * + * But already submitted bio can still be finished on this folio. + * Furthermore, endio function won't skip folio which has Ordered + * (Private2) already cleared, so it's possible for endio and + * invalidate_folio to do the same ordered extent accounting twice + * on one folio. + * + * So here we wait for any submitted bios to finish, so that we won't + * do double ordered extent accounting on the same folio. + */ + folio_wait_writeback(folio); + wait_subpage_spinlock(&folio->page); + + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. + * If the range doesn't cover the full folio, we don't need to and + * shouldn't clear page extent mapped, as folio->private can still + * record subpage dirty bits for other part of the range. + * + * For cases that invalidate the full folio even the range doesn't + * cover the full folio, like invalidating the last folio, we're + * still safe to wait for ordered extent to finish. + */ + if (!(offset == 0 && length == folio_size(folio))) { + btrfs_release_folio(folio, GFP_NOFS); + return; + } + + if (!inode_evicting) + lock_extent(tree, page_start, page_end, &cached_state); + + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + u64 range_end; + u32 range_len; + u32 extra_flags = 0; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + page_end + 1 - cur); + if (!ordered) { + range_end = page_end; + /* + * No ordered extent covering this range, we are safe + * to delete all extent states in the range. + */ + extra_flags = EXTENT_CLEAR_ALL_BITS; + goto next; + } + if (ordered->file_offset > cur) { + /* + * There is a range between [cur, oe->file_offset) not + * covered by any ordered extent. 
+ * We are safe to delete all extent states, and handle + * the ordered extent in the next iteration. + */ + range_end = ordered->file_offset - 1; + extra_flags = EXTENT_CLEAR_ALL_BITS; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + ASSERT(range_end + 1 - cur < U32_MAX); + range_len = range_end + 1 - cur; + if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { + /* + * If Ordered (Private2) is cleared, it means endio has + * already been executed for the range. + * We can't delete the extent states as + * btrfs_finish_ordered_io() may still use some of them. + */ + goto next; + } + btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); + + /* + * IO on this page will never be started, so we need to account + * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW + * here, must leave that up for the ordered extent completion. + * + * This will also unlock the range for incoming + * btrfs_finish_ordered_io(). + */ + if (!inode_evicting) + clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + spin_lock_irq(&inode->ordered_tree.lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); + + /* + * If the ordered extent has finished, we're safe to delete all + * the extent states of the range, otherwise + * btrfs_finish_ordered_io() will get executed by endio for + * other pages, so we can't delete extent states. + */ + if (btrfs_dec_test_ordered_pending(inode, &ordered, + cur, range_end + 1 - cur)) { + btrfs_finish_ordered_io(ordered); + /* + * The ordered extent has finished, now we're again + * safe to delete all extent states of the range. + */ + extra_flags = EXTENT_CLEAR_ALL_BITS; + } +next: + if (ordered) + btrfs_put_ordered_extent(ordered); + /* + * Qgroup reserved space handler + * Sector(s) here will be either: + * + * 1) Already written to disk or bio already finished + * Then its QGROUP_RESERVED bit in io_tree is already cleared. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the + * QGROUP_RESERVED bit of its io_tree, and free the qgroup + * reserved data space. + * Since the IO will never happen for this page. + */ + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); + if (!inode_evicting) { + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | + extra_flags, &cached_state); + } + cur = range_end + 1; + } + /* + * We have iterated through all ordered extents of the page, the page + * should not have Ordered (Private2) anymore, or the above iteration + * did something wrong. + */ + ASSERT(!folio_test_ordered(folio)); + btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); + if (!inode_evicting) + __btrfs_release_folio(folio, GFP_NOFS); + clear_page_extent_mapped(&folio->page); +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. 
We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + unsigned long zero_start; + loff_t size; + vm_fault_t ret; + int ret2; + int reserved = 0; + u64 reserved_space; + u64 page_start; + u64 page_end; + u64 end; + + reserved_space = PAGE_SIZE; + + sb_start_pagefault(inode->i_sb); + page_start = page_offset(page); + page_end = page_start + PAGE_SIZE - 1; + end = page_end; + + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepages() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by btrfs_page_mkwrite() function. + */ + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (!ret2) { + ret2 = file_update_time(vmf->vma->vm_file); + reserved = 1; + } + if (ret2) { + ret = vmf_error(ret2); + if (reserved) + goto out; + goto out_noreserve; + } + + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ +again: + down_read(&BTRFS_I(inode)->i_mmap_lock); + lock_page(page); + size = i_size_read(inode); + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, &cached_state); + ret2 = set_page_extent_mapped(page); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. 
Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, + PAGE_SIZE); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + if (page->index == ((size - 1) >> PAGE_SHIFT)) { + reserved_space = round_up(size - page_start, + fs_info->sectorsize); + if (reserved_space < PAGE_SIZE) { + end = page_start + reserved_space - 1; + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, + PAGE_SIZE - reserved_space, true); + } + } + + /* + * page_mkwrite gets called when the page is firstly dirtied after it's + * faulted in, but write(2) could also dirty a page and set delalloc + * bits, thus in this case for space account reason, we still need to + * clear any delalloc bits within this page range since we have to + * reserve data&meta space before lock_page() (see above comments). + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + &cached_state); + if (ret2) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_SIZE > size) + zero_start = offset_in_page(size); + else + zero_start = PAGE_SIZE; + + if (zero_start != PAGE_SIZE) + memzero_page(page, zero_start, PAGE_SIZE - zero_start); + + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); + + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + + unlock_extent(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); + + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; + +out_unlock: + unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); +out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, + reserved_space, (ret != 0)); +out_noreserve: + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return ret; +} + +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) +{ + struct btrfs_truncate_control control = { + .inode = inode, + .ino = btrfs_ino(inode), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *rsv; + int ret; + struct btrfs_trans_handle *trans; + u64 mask = fs_info->sectorsize - 1; + const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + + if (!skip_writeback) { + ret = btrfs_wait_ordered_range(&inode->vfs_inode, + inode->vfs_inode.i_size & (~mask), + (u64)-1); + if (ret) + return ret; + } + + /* + * Yes ladies and gentlemen, this is indeed ugly. We have a couple of + * things going on here: + * + * 1) We need to reserve space to update our inode. 
+ * + * 2) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to be separate. The fact is we can use a lot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be separate so it + * doesn't end up using space reserved for updating the inode. We also + * need to be able to stop the transaction and start a new one, which + * means we need to be able to update the inode several times, and we + * have no idea of knowing how many times that will be, so we can't just + * reserve 1 item for the entirety of the operation, so that has to be + * done separately as well. + * + * So that leaves us with + * + * 1) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 2) fs_info->trans_block_rsv - this will have 1 items worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) + return -ENOMEM; + rsv->size = min_size; + rsv->failfast = true; + + /* + * 1 for the truncate slack space + * 1 for updating the inode. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + min_size, false); + /* + * We have reserved 2 metadata units when we started the transaction and + * min_size matches 1 unit, so this should never fail, but if it does, + * it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) { + btrfs_end_transaction(trans); + goto out; + } + + trans->block_rsv = rsv; + + while (1) { + struct extent_state *cached_state = NULL; + const u64 new_size = inode->vfs_inode.i_size; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + + control.new_size = new_size; + lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + /* + * We want to drop from the next block forward in case this new + * size is not block aligned since we will be keeping the last + * block of the extent just the way it is. + */ + btrfs_drop_extent_map_range(inode, + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, false); + + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); + + unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + + trans->block_rsv = &fs_info->trans_block_rsv; + if (ret != -ENOSPC && ret != -EAGAIN) + break; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + break; + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + btrfs_block_rsv_release(fs_info, rsv, -1, NULL); + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, + rsv, min_size, false); + /* + * We have reserved 2 metadata units when we started the + * transaction and min_size matches 1 unit, so this should never + * fail, but if it does, it's not critical we just fail truncation. 
+ */ + if (WARN_ON(ret)) + break; + + trans->block_rsv = rsv; + } + + /* + * We can't call btrfs_truncate_block inside a trans handle as we could + * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we + * know we've truncated everything except the last little bit, and can + * do btrfs_truncate_block and then update the disk_i_size. + */ + if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + if (ret) + goto out; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_inode_safe_disk_i_size_write(inode, 0); + } + + if (trans) { + int ret2; + + trans->block_rsv = &fs_info->trans_block_rsv; + ret2 = btrfs_update_inode(trans, root, inode); + if (ret2 && !ret) + ret = ret2; + + ret2 = btrfs_end_transaction(trans); + if (ret2 && !ret) + ret = ret2; + btrfs_btree_balance_dirty(fs_info); + } +out: + btrfs_free_block_rsv(fs_info, rsv); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + * + * If no extents were dropped or trimmed we don't need to force the next + * fsync to truncate all the inode's items from the log and re-log them + * all. This means the truncate operation did not change the file size, + * or changed it to a smaller size but there was only an implicit hole + * between the old i_size and the new i_size, and there were no prealloc + * extents beyond i_size to drop. + */ + if (control.extents_found > 0) + btrfs_set_inode_full_sync(inode); + + return ret; +} + +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, + struct inode *dir) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (inode) { + /* + * Subvolumes don't inherit the sgid bit or the parent's gid if + * the parent's sgid bit is set. This is probably a bug. 
+ */ + inode_init_owner(idmap, inode, NULL, + S_IFDIR | (~current_umask() & S_IRWXUGO)); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + } + return inode; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_inode *ei; + struct inode *inode; + + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + + ei->root = NULL; + ei->generation = 0; + ei->last_trans = 0; + ei->last_sub_trans = 0; + ei->logged_trans = 0; + ei->delalloc_bytes = 0; + ei->new_delalloc_bytes = 0; + ei->defrag_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->ro_flags = 0; + ei->csum_bytes = 0; + ei->index_cnt = (u64)-1; + ei->dir_index = 0; + ei->last_unlink_trans = 0; + ei->last_reflink_trans = 0; + ei->last_log_commit = 0; + + spin_lock_init(&ei->lock); + ei->outstanding_extents = 0; + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + ei->runtime_flags = 0; + ei->prop_compress = BTRFS_COMPRESS_NONE; + ei->defrag_compress = BTRFS_COMPRESS_NONE; + + ei->delayed_node = NULL; + + ei->i_otime.tv_sec = 0; + ei->i_otime.tv_nsec = 0; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree); + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + ei->io_tree.inode = ei; + extent_io_tree_init(fs_info, &ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); + mutex_init(&ei->log_mutex); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); + RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->i_mmap_lock); + + return inode; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode) +{ + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +#endif + +void btrfs_free_inode(struct inode *inode) +{ + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +void btrfs_destroy_inode(struct inode *vfs_inode) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; + bool freespace_inode; + + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + if (!S_ISDIR(vfs_inode->i_mode)) { + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + } + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); + + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + return; + + /* + * If this is a free space inode do not take the ordered extents lockdep + * map. 
+ */ + freespace_inode = btrfs_is_free_space_inode(inode); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + btrfs_err(root->fs_info, + "found ordered extent %llu %llu on inode cleanup", + ordered->file_offset, ordered->num_bytes); + + if (!freespace_inode) + btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); + + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + btrfs_qgroup_check_reserved_leak(inode); + inode_tree_del(inode); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); +} + +int btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (root == NULL) + return 1; + + /* the snap/subvol tree is on deleting */ + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + else + return generic_drop_inode(inode); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + +void __cold btrfs_destroy_cachep(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + bioset_exit(&btrfs_dio_bioset); + kmem_cache_destroy(btrfs_inode_cachep); +} + +int __init btrfs_init_cachep(void) +{ + btrfs_inode_cachep = kmem_cache_create("btrfs_inode", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); + if (!btrfs_inode_cachep) + goto fail; + + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bbio.bio), + BIOSET_NEED_BVECS)) + goto fail; + + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + u64 delalloc_bytes; + u64 inode_bytes; + struct inode *inode = d_inode(path->dentry); + u32 blocksize = inode->i_sb->s_blocksize; + u32 bi_flags = BTRFS_I(inode)->flags; + u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; + + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + if (bi_flags & BTRFS_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (bi_flags & BTRFS_INODE_COMPRESS) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (bi_flags & BTRFS_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (bi_flags & BTRFS_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + if (bi_ro_flags & BTRFS_INODE_RO_VERITY) + stat->attributes |= STATX_ATTR_VERITY; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + + generic_fillattr(idmap, request_mask, inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_dev; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; + inode_bytes = inode_get_bytes(inode); + spin_unlock(&BTRFS_I(inode)->lock); + stat->blocks = (ALIGN(inode_bytes, blocksize) + + ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; + return 0; +} + +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_trans_handle *trans; + unsigned int trans_num_items; + struct btrfs_root *root = 
BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct btrfs_rename_ctx old_rename_ctx; + struct btrfs_rename_ctx new_rename_ctx; + u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); + u64 old_idx = 0; + u64 new_idx = 0; + int ret; + int ret2; + bool need_abort = false; + struct fscrypt_name old_fname, new_fname; + struct fscrypt_str *old_name, *new_name; + + /* + * For non-subvolumes allow exchange only within one subvolume, in the + * same inode namespace. Two subvolumes (represented as directory) can + * be exchanged as they're a logical link and have a fixed inode number. + */ + if (root != dest && + (old_ino != BTRFS_FIRST_FREE_OBJECTID || + new_ino != BTRFS_FIRST_FREE_OBJECTID)) + return -EXDEV; + + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = &old_fname.disk_name; + new_name = &new_fname.disk_name; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&fs_info->subvol_sem); + + /* + * For each inode: + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + * 1 to update parent inode + * + * If the parents are the same, we only need to account for one + */ + trans_num_items = (old_dir == new_dir ? 9 : 10); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode item + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + trans_num_items += 4; + else + trans_num_items += 3; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(trans); + } else { + ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, + btrfs_ino(BTRFS_I(new_dir)), + old_idx); + if (ret) + goto out_fail; + need_abort = true; + } + + /* And now for the dest. */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(trans); + } else { + ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, + btrfs_ino(BTRFS_I(old_dir)), + new_idx); + if (ret) { + if (need_abort) + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + } + + /* Update inode version and ctime/mtime. 
*/ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), + BTRFS_I(old_inode), true); + btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), + BTRFS_I(new_inode), true); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), + BTRFS_I(old_dentry->d_inode), + old_name, &old_rename_ctx); + if (!ret) + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), + BTRFS_I(new_dentry->d_inode), + new_name, &new_rename_ctx); + if (!ret) + ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), + new_name, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), + old_name, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + /* + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. + */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(dest); + + /* Do the log updates for all inodes. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + old_rename_ctx.index, new_dentry->d_parent); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), + new_rename_ctx.index, old_dentry->d_parent); + + /* Now unpin the logs. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(dest); +out_fail: + ret2 = btrfs_end_transaction(trans); + ret = ret ? 
ret : ret2; +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&fs_info->subvol_sem); + + fscrypt_free_filename(&new_fname); + fscrypt_free_filename(&old_fname); + return ret; +} + +static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, + struct inode *dir) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (inode) { + inode_init_owner(idmap, inode, dir, + S_IFCHR | WHITEOUT_MODE); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + } + return inode; +} + +static int btrfs_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_new_inode_args whiteout_args = { + .dir = old_dir, + .dentry = old_dentry, + }; + struct btrfs_trans_handle *trans; + unsigned int trans_num_items; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); + struct btrfs_rename_ctx rename_ctx; + u64 index = 0; + int ret; + int ret2; + u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + struct fscrypt_name old_fname, new_fname; + + if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); + if (ret) { + if (ret == -EEXIST) { + /* we shouldn't get + * eexist without a new_inode */ + if (WARN_ON(!new_inode)) { + goto out_fscrypt_names; + } + } else { + /* maybe -EOVERFLOW */ + goto out_fscrypt_names; + } + } + ret = 0; + + /* + * we're using rename to replace one file with another. Start IO on it + * now so we don't add too much work to the end of the transaction + */ + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) + filemap_flush(old_inode->i_mapping); + + if (flags & RENAME_WHITEOUT) { + whiteout_args.inode = new_whiteout_inode(idmap, old_dir); + if (!whiteout_args.inode) { + ret = -ENOMEM; + goto out_fscrypt_names; + } + ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); + if (ret) + goto out_whiteout_inode; + } else { + /* 1 to update the old parent inode. 
*/ + trans_num_items = 1; + } + + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* Close the race window with snapshot create/destroy ioctl */ + down_read(&fs_info->subvol_sem); + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } + /* + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + */ + trans_num_items += 4; + /* 1 to update new parent inode if it's not the same as the old parent */ + if (new_dir != old_dir) + trans_num_items++; + if (new_inode) { + /* + * 1 to update inode + * 1 to remove inode ref + * 1 to remove dir item + * 1 to remove dir index + * 1 to possibly add orphan item + */ + trans_num_items += 5; + } + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } + + ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(trans); + } else { + ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, + old_ino, btrfs_ino(BTRFS_I(new_dir)), + index); + if (ret) + goto out_fail; + } + + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), + BTRFS_I(old_inode), true); + + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + } else { + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), + BTRFS_I(d_inode(old_dentry)), + &old_fname.disk_name, &rename_ctx); + if (!ret) + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + if (new_inode) { + inode_inc_iversion(new_inode); + if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), + BTRFS_I(d_inode(new_dentry)), + &new_fname.disk_name); + } + if (!ret && new_inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, + BTRFS_I(d_inode(new_dentry))); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + } + + ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), + &new_fname.disk_name, 0, index); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + rename_ctx.index, new_dentry->d_parent); + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_create_new_inode(trans, &whiteout_args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } else { + unlock_new_inode(whiteout_args.inode); + iput(whiteout_args.inode); + whiteout_args.inode = NULL; + } + } +out_fail: + ret2 = 
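[Editorial sketch, not part of the patch] The reservation comments above reduce to simple arithmetic: a base of 9 or 10 dir-item/index/parent updates depending on whether both parents are the same directory, plus 4 root ref/backref items per side that is a subvolume or 3 inode-item/ref items per regular inode. A minimal userspace illustration, with all names local to the sketch:

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned int rename_exchange_items(bool same_parent,
						  bool old_is_subvol,
						  bool new_is_subvol)
	{
		/* dir item + dir index removed and re-added for each side,
		 * plus one parent-inode update per distinct parent */
		unsigned int items = same_parent ? 9 : 10;

		/* subvolume side: 4 root ref/backref items;
		 * regular inode side: inode update + old/new inode ref = 3 */
		items += old_is_subvol ? 4 : 3;
		items += new_is_subvol ? 4 : 3;
		return items;
	}

	int main(void)
	{
		/* plain cross-directory exchange of two regular inodes: 16 items */
		printf("items reserved: %u\n",
		       rename_exchange_items(false, false, false));
		return 0;
	}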
btrfs_end_transaction(trans); + ret = ret ? ret : ret2; +out_notrans: + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&fs_info->subvol_sem); + if (flags & RENAME_WHITEOUT) + btrfs_new_inode_args_destroy(&whiteout_args); +out_whiteout_inode: + if (flags & RENAME_WHITEOUT) + iput(whiteout_args.inode); +out_fscrypt_names: + fscrypt_free_filename(&old_fname); + fscrypt_free_filename(&new_fname); + return ret; +} + +static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + int ret; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, + new_dentry, flags); + + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); + + return ret; +} + +struct btrfs_delalloc_work { + struct inode *inode; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ + struct btrfs_delalloc_work *delalloc_work; + struct inode *inode; + + delalloc_work = container_of(work, struct btrfs_delalloc_work, + work); + inode = delalloc_work->inode; + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + + iput(inode); + complete(&delalloc_work->completion); +} + +static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) +{ + struct btrfs_delalloc_work *work; + + work = kmalloc(sizeof(*work), GFP_NOFS); + if (!work) + return NULL; + + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + + return work; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. 
+ */ +static int start_delalloc_inodes(struct btrfs_root *root, + struct writeback_control *wbc, bool snapshot, + bool in_reclaim_context) +{ + struct btrfs_inode *binode; + struct inode *inode; + struct btrfs_delalloc_work *work, *next; + LIST_HEAD(works); + LIST_HEAD(splice); + int ret = 0; + bool full_flush = wbc->nr_to_write == LONG_MAX; + + mutex_lock(&root->delalloc_mutex); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); + + if (in_reclaim_context && + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + continue; + + inode = igrab(&binode->vfs_inode); + if (!inode) { + cond_resched_lock(&root->delalloc_lock); + continue; + } + spin_unlock(&root->delalloc_lock); + + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); + if (full_flush) { + work = btrfs_alloc_delalloc_work(inode); + if (!work) { + iput(inode); + ret = -ENOMEM; + goto out; + } + list_add_tail(&work->list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + } else { + ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); + btrfs_add_delayed_iput(BTRFS_I(inode)); + if (ret || wbc->nr_to_write <= 0) + goto out; + } + cond_resched(); + spin_lock(&root->delalloc_lock); + } + spin_unlock(&root->delalloc_lock); + +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + wait_for_completion(&work->completion); + kfree(work); + } + + if (!list_empty(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + spin_unlock(&root->delalloc_lock); + } + mutex_unlock(&root->delalloc_mutex); + return ret; +} + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) +{ + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + struct btrfs_fs_info *fs_info = root->fs_info; + + if (BTRFS_FS_ERROR(fs_info)) + return -EROFS; + + return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); +} + +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context) +{ + struct writeback_control wbc = { + .nr_to_write = nr, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + struct btrfs_root *root; + LIST_HEAD(splice); + int ret; + + if (BTRFS_FS_ERROR(fs_info)) + return -EROFS; + + mutex_lock(&fs_info->delalloc_root_mutex); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + /* + * Reset nr_to_write here so we know that we're doing a full + * flush. 
+ */ + if (nr == LONG_MAX) + wbc.nr_to_write = LONG_MAX; + + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + btrfs_put_root(root); + if (ret < 0 || wbc.nr_to_write <= 0) + goto out; + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); + + ret = 0; +out: + if (!list_empty(&splice)) { + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + } + mutex_unlock(&fs_info->delalloc_root_mutex); + return ret; +} + +static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + }; + unsigned int trans_num_items; + int err; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + + name_len = strlen(symname); + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + return -ENAMETOOLONG; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); + inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &btrfs_aops; + btrfs_i_size_write(BTRFS_I(inode), name_len); + inode_set_bytes(inode, name_len); + + new_inode_args.inode = inode; + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (err) + goto out_inode; + /* 1 additional item for the inline extent */ + trans_num_items++; + + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; + } + + err = btrfs_create_new_inode(trans, &new_inode_args); + if (err) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + btrfs_abort_transaction(trans, err); + discard_new_inode(inode); + inode = NULL; + goto out; + } + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.offset = 0; + key.type = BTRFS_EXTENT_DATA_KEY; + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + btrfs_abort_transaction(trans, err); + btrfs_free_path(path); + discard_new_inode(inode); + inode = NULL; + goto out; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_free_path(path); + + d_instantiate_new(dentry, inode); + err = 0; +out: + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + 
btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); + return err; +} + +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, + struct btrfs_inode *inode, + struct btrfs_key *ins, + u64 file_offset) +{ + struct btrfs_file_extent_item stack_fi; + struct btrfs_replace_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; + u64 start = ins->objectid; + u64 len = ins->offset; + u64 qgroup_released = 0; + int ret; + + memset(&stack_fi, 0, sizeof(stack_fi)); + + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); + /* Encryption and other encoding is reserved and all 0 */ + + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); + if (ret < 0) + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, inode, + file_offset, &stack_fi, + true, qgroup_released); + if (ret) + goto free_qgroup; + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.is_new_extent = true; + extent_info.update_times = true; + extent_info.qgroup_reserved = qgroup_released; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto free_qgroup; + } + + ret = btrfs_replace_file_extents(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + goto free_qgroup; + return trans; + +free_qgroup: + /* + * We have released qgroup data range at the beginning of the function, + * and normally qgroup_released bytes will be freed when committing + * transaction. + * But if we error out early, we have to free what we have released + * or we leak qgroup data reservation. + */ + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, qgroup_released, + BTRFS_QGROUP_RSV_DATA); + return ERR_PTR(ret); +} + +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 cur_offset = start; + u64 clear_offset = start; + u64 i_size; + u64 cur_bytes; + u64 last_alloc = (u64)-1; + int ret = 0; + bool own_trans = true; + u64 end = start + num_bytes - 1; + + if (trans) + own_trans = false; + while (num_bytes > 0) { + cur_bytes = min_t(u64, num_bytes, SZ_256M); + cur_bytes = max(cur_bytes, min_size); + /* + * If we are severely fragmented we could end up with really + * small allocations, so if the allocator is returning small + * chunks lets make its job easier by only searching for those + * sized chunks. 
+ */ + cur_bytes = min(cur_bytes, last_alloc); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); + if (ret) + break; + + /* + * We've reserved this space, and thus converted it from + * ->bytes_may_use to ->bytes_reserved. Any error that happens + * from here on out we will only need to clear our reservation + * for the remaining unreserved area, so advance our + * clear_offset by our extent size. + */ + clear_offset += ins.offset; + + last_alloc = ins.offset; + trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), + &ins, cur_offset); + /* + * Now that we inserted the prealloc extent we can finally + * decrement the number of reservations in the block group. + * If we did it before, we could race with relocation and have + * relocation miss the reserved extent, making it fail later. + */ + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_free_reserved_extent(fs_info, ins.objectid, + ins.offset, 0); + break; + } + + em = alloc_extent_map(); + if (!em) { + btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, + cur_offset + ins.offset - 1, false); + btrfs_set_inode_full_sync(BTRFS_I(inode)); + goto next; + } + + em->start = cur_offset; + em->orig_start = cur_offset; + em->len = ins.offset; + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = ins.offset; + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->generation = trans->transid; + + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); + free_extent_map(em); +next: + num_bytes -= ins.offset; + cur_offset += ins.offset; + *alloc_hint = ins.objectid + ins.offset; + + inode_inc_iversion(inode); + inode_set_ctime_current(inode); + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + } + + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + + if (ret) { + btrfs_abort_transaction(trans, ret); + if (own_trans) + btrfs_end_transaction(trans); + break; + } + + if (own_trans) { + btrfs_end_transaction(trans); + trans = NULL; + } + } + if (clear_offset < end) + btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, + end - clear_offset + 1); + return ret; +} + +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} + +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, trans); +} + +static int btrfs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + umode_t mode = inode->i_mode; + + if (mask & MAY_WRITE && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if (btrfs_root_readonly(root)) + return -EROFS; + if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) + return -EACCES; + } + return generic_permission(idmap, inode, mask); +} + +static int 
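[Editorial sketch, not part of the patch] The prealloc loop above clamps each request three ways: never more than 256M, never less than min_size, and never more than whatever the allocator managed to return last time, so a fragmented filesystem is not repeatedly asked for chunks it cannot satisfy. A standalone sketch of that clamping, with invented allocator results to mimic fragmentation:

	#include <stdint.h>
	#include <stdio.h>

	#define SZ_256M (256ULL * 1024 * 1024)

	static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
	static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

	int main(void)
	{
		uint64_t num_bytes = 3 * SZ_256M;	/* made-up range to preallocate */
		uint64_t min_size = 64 * 1024;
		uint64_t last_alloc = UINT64_MAX;	/* last successful allocation size */
		/* pretend results that shrink as the allocator gets fragmented */
		uint64_t fake_alloc[] = { SZ_256M, SZ_256M / 4, SZ_256M / 16 };

		for (int i = 0; num_bytes > 0 && i < 3; i++) {
			uint64_t cur = min_u64(num_bytes, SZ_256M);

			cur = max_u64(cur, min_size);
			cur = min_u64(cur, last_alloc);	/* don't ask for more than last success */
			printf("iteration %d: request %llu bytes\n",
			       i, (unsigned long long)cur);

			last_alloc = fake_alloc[i];	/* what "the allocator" returned */
			num_bytes -= last_alloc;
		}
		return 0;
	}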
btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct file *file, umode_t mode) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = file->f_path.dentry, + .orphan = true, + }; + unsigned int trans_num_items; + int ret; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(idmap, inode, dir, mode); + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + + new_inode_args.inode = inode; + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_new_inode_args; + } + + ret = btrfs_create_new_inode(trans, &new_inode_args); + + /* + * We set number of links to 0 in btrfs_create_new_inode(), and here we + * set it to 1 because d_tmpfile() will issue a warning if the count is + * 0, through: + * + * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() + */ + set_nlink(inode, 1); + + if (!ret) { + d_tmpfile(file, inode); + unlock_new_inode(inode); + mark_inode_dirty(inode); + } + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (ret) + iput(inode); + return finish_open_simple(file, ret); +} + +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + u32 len; + + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; + while (index <= end_index) { + page = find_get_page(inode->vfs_inode.i_mapping, index); + ASSERT(page); /* Pages should be in the extent_io_tree */ + + btrfs_page_set_writeback(fs_info, page, start, len); + put_page(page); + index++; + } +} + +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) +{ + switch (compress_type) { + case BTRFS_COMPRESS_NONE: + return BTRFS_ENCODED_IO_COMPRESSION_NONE; + case BTRFS_COMPRESS_ZLIB: + return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; + case BTRFS_COMPRESS_LZO: + /* + * The LZO format depends on the sector size. 64K is the maximum + * sector size that we support. 
+ */ + if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) + return -EINVAL; + return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + + (fs_info->sectorsize_bits - 12); + case BTRFS_COMPRESS_ZSTD: + return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; + default: + return -EUCLEAN; + } +} + +static ssize_t btrfs_encoded_read_inline( + struct kiocb *iocb, + struct iov_iter *iter, u64 start, + u64 lockend, + struct extent_state **cached_state, + u64 extent_start, size_t count, + struct btrfs_ioctl_encoded_io_args *encoded, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *item; + u64 ram_bytes; + unsigned long ptr; + void *tmp; + ssize_t ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + extent_start, 0); + if (ret) { + if (ret > 0) { + /* The extent item disappeared? */ + ret = -EIO; + } + goto out; + } + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + ptr = btrfs_file_extent_inline_start(item); + + encoded->len = min_t(u64, extent_start + ram_bytes, + inode->vfs_inode.i_size) - iocb->ki_pos; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, item)); + if (ret < 0) + goto out; + encoded->compression = ret; + if (encoded->compression) { + size_t inline_size; + + inline_size = btrfs_file_extent_inline_item_len(leaf, + path->slots[0]); + if (inline_size > count) { + ret = -ENOBUFS; + goto out; + } + count = inline_size; + encoded->unencoded_len = ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - extent_start; + } else { + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + ptr += iocb->ki_pos - extent_start; + } + + tmp = kmalloc(count, GFP_NOFS); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(leaf, tmp, ptr, count); + btrfs_release_path(path); + unlock_extent(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + ret = copy_to_iter(tmp, count, iter); + if (ret != count) + ret = -EFAULT; + kfree(tmp); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_encoded_read_private { + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; +}; + +static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) +{ + struct btrfs_encoded_read_private *priv = bbio->private; + + if (bbio->bio.bi_status) { + /* + * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the + * atomic_dec_return() or io_wait_event() in + * btrfs_encoded_read_regular_fill_pages() to ensure that this + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). 
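[Editorial sketch, not part of the patch] The LZO branch above encodes the sector size into the returned constant: the encoded-IO LZO values are consecutive, one per power-of-two sector size from 4K to 64K, so the offset is just log2(sectorsize) - 12. A standalone restatement of that mapping; the enum values here are placeholders and only their consecutive spacing matters:

	#include <stdio.h>

	enum {
		ENCODED_LZO_4K = 3,	/* placeholder base value; only spacing matters */
		ENCODED_LZO_8K,
		ENCODED_LZO_16K,
		ENCODED_LZO_32K,
		ENCODED_LZO_64K,
	};

	static int lzo_level_from_sectorsize(unsigned int sectorsize)
	{
		unsigned int bits = 0;

		if (sectorsize < 4096 || sectorsize > 65536 ||
		    (sectorsize & (sectorsize - 1)))
			return -1;		/* outside the supported 4K..64K range */
		while ((1u << bits) < sectorsize)
			bits++;			/* bits == sectorsize_bits */
		return ENCODED_LZO_4K + (bits - 12);
	}

	int main(void)
	{
		printf("4K  -> %d\n", lzo_level_from_sectorsize(4096));
		printf("64K -> %d\n", lzo_level_from_sectorsize(65536));
		return 0;
	}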
+ */ + WRITE_ONCE(priv->status, bbio->bio.bi_status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); + bio_put(&bbio->bio); +} + +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .pending = ATOMIC_INIT(1), + }; + unsigned long i = 0; + struct btrfs_bio *bbio; + + init_waitqueue_head(&priv.wait); + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + btrfs_encoded_read_endio, &priv); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + + do { + size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); + + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { + atomic_inc(&priv.pending); + btrfs_submit_bio(bbio, 0); + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + btrfs_encoded_read_endio, &priv); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + continue; + } + + i++; + disk_bytenr += bytes; + disk_io_size -= bytes; + } while (disk_io_size); + + atomic_inc(&priv.pending); + btrfs_submit_bio(bbio, 0); + + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. */ + return blk_status_to_errno(READ_ONCE(priv.status)); +} + +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, + struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + unsigned long nr_pages, i; + u64 cur; + size_t page_offset; + ssize_t ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages); + if (ret) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + disk_io_size, pages); + if (ret) + goto out; + + unlock_extent(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + if (compressed) { + i = 0; + page_offset = 0; + } else { + i = (iocb->ki_pos - start) >> PAGE_SHIFT; + page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); + } + cur = 0; + while (cur < count) { + size_t bytes = min_t(size_t, count - cur, + PAGE_SIZE - page_offset); + + if (copy_page_to_iter(pages[i], page_offset, bytes, + iter) != bytes) { + ret = -EFAULT; + goto out; + } + i++; + cur += bytes; + page_offset = 0; + } + ret = count; +out: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + return ret; +} + +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + ssize_t ret; + size_t count = iov_iter_count(iter); + u64 start, lockend, disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; + struct extent_map *em; + bool unlocked = false; + + file_accessed(iocb->ki_filp); + + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + + if 
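[Editorial sketch, not part of the patch] The fill-pages helper above uses a biased pending counter: it starts at 1 (the submitter's own reference), each submitted bio adds 1, each completion subtracts 1, and the submitter drops its bias last so the final wakeup cannot fire before every bio has been issued. A userspace sketch of that shape using C11 atomics and pthreads, with all names local to the sketch:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <unistd.h>

	static atomic_int pending = 1;		/* the submitter's bias */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t all_done = PTHREAD_COND_INITIALIZER;

	static void *io_unit(void *arg)
	{
		(void)arg;
		usleep(1000);			/* pretend to do I/O */
		if (atomic_fetch_sub(&pending, 1) - 1 == 0) {	/* last one out */
			pthread_mutex_lock(&lock);
			pthread_cond_signal(&all_done);		/* wake the waiter */
			pthread_mutex_unlock(&lock);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t tids[3];

		for (int i = 0; i < 3; i++) {
			atomic_fetch_add(&pending, 1);	/* one reference per submission */
			pthread_create(&tids[i], NULL, io_unit, NULL);
		}

		pthread_mutex_lock(&lock);
		if (atomic_fetch_sub(&pending, 1) - 1 != 0) {	/* drop the bias */
			while (atomic_load(&pending) != 0)
				pthread_cond_wait(&all_done, &lock);
		}
		pthread_mutex_unlock(&lock);

		for (int i = 0; i < 3; i++)
			pthread_join(tids[i], NULL);
		return 0;
	}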
(iocb->ki_pos >= inode->vfs_inode.i_size) { + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + return 0; + } + start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); + /* + * We don't know how long the extent containing iocb->ki_pos is, but if + * it's compressed we know that it won't be longer than this. + */ + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + lock_extent(io_tree, start, lockend, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, &cached_state); + cond_resched(); + } + + em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_extent; + } + + if (em->block_start == EXTENT_MAP_INLINE) { + u64 extent_start = em->start; + + /* + * For inline extents we get everything we need out of the + * extent item. + */ + free_extent_map(em); + em = NULL; + ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, + &cached_state, extent_start, + count, encoded, &unlocked); + goto out; + } + + /* + * We only want to return up to EOF even if the extent extends beyond + * that. + */ + encoded->len = min_t(u64, extent_map_end(em), + inode->vfs_inode.i_size) - iocb->ki_pos; + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + disk_bytenr = EXTENT_MAP_HOLE; + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + disk_bytenr = em->block_start; + /* + * Bail if the buffer isn't large enough to return the whole + * compressed extent. + */ + if (em->block_len > count) { + ret = -ENOBUFS; + goto out_em; + } + disk_io_size = em->block_len; + count = em->block_len; + encoded->unencoded_len = em->ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + em->compress_type); + if (ret < 0) + goto out_em; + encoded->compression = ret; + } else { + disk_bytenr = em->block_start + (start - em->start); + if (encoded->len > count) + encoded->len = count; + /* + * Don't read beyond what we locked. This also limits the page + * allocations that we'll do. 
+ */ + disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + disk_io_size - iocb->ki_pos; + encoded->len = count; + encoded->unencoded_len = count; + disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + } + free_extent_map(em); + em = NULL; + + if (disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + unlocked = true; + ret = iov_iter_zero(count, iter); + if (ret != count) + ret = -EFAULT; + } else { + ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + encoded->compression, + &unlocked); + } + +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; +out_em: + free_extent_map(em); +out_unlock_extent: + if (!unlocked) + unlock_extent(io_tree, start, lockend, &cached_state); +out_unlock_inode: + if (!unlocked) + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + return ret; +} + +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_changeset *data_reserved = NULL; + struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; + int compression; + size_t orig_count; + u64 start, end; + u64 num_bytes, ram_bytes, disk_num_bytes; + unsigned long nr_pages, i; + struct page **pages; + struct btrfs_key ins; + bool extent_reserved = false; + struct extent_map *em; + ssize_t ret; + + switch (encoded->compression) { + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: + compression = BTRFS_COMPRESS_ZLIB; + break; + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: + compression = BTRFS_COMPRESS_ZSTD; + break; + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: + /* The sector size must match for LZO. */ + if (encoded->compression - + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != + fs_info->sectorsize_bits) + return -EINVAL; + compression = BTRFS_COMPRESS_LZO; + break; + default: + return -EINVAL; + } + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + + orig_count = iov_iter_count(from); + + /* The extent size must be sane. */ + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) + return -EINVAL; + + /* + * The compressed data must be smaller than the decompressed data. + * + * It's of course possible for data to compress to larger or the same + * size, but the buffered I/O path falls back to no compression for such + * data, and we don't want to break any assumptions by creating these + * extents. + * + * Note that this is less strict than the current check we have that the + * compressed data must be at least one sector smaller than the + * decompressed data. We only want to enforce the weaker requirement + * from old kernels that it is at least one byte smaller. + */ + if (orig_count >= encoded->unencoded_len) + return -EINVAL; + + /* The extent must start on a sector boundary. */ + start = iocb->ki_pos; + if (!IS_ALIGNED(start, fs_info->sectorsize)) + return -EINVAL; + + /* + * The extent must end on a sector boundary. 
However, we allow a write + * which ends at or extends i_size to have an unaligned length; we round + * up the extent size and set i_size to the unaligned end. + */ + if (start + encoded->len < inode->vfs_inode.i_size && + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) + return -EINVAL; + + /* Finally, the offset in the unencoded data must be sector-aligned. */ + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) + return -EINVAL; + + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); + end = start + num_bytes - 1; + + /* + * If the extent cannot be inline, the compressed data on disk must be + * sector-aligned. For convenience, we extend it with zeroes if it + * isn't. + */ + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + char *kaddr; + + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); + if (!pages[i]) { + ret = -ENOMEM; + goto out_pages; + } + kaddr = kmap_local_page(pages[i]); + if (copy_from_iter(kaddr, bytes, from) != bytes) { + kunmap_local(kaddr); + ret = -EFAULT; + goto out_pages; + } + if (bytes < PAGE_SIZE) + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kunmap_local(kaddr); + } + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + if (ret) + goto out_pages; + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + if (ret) + goto out_pages; + lock_extent(io_tree, start, end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); + if (!ordered && + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) + break; + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, end, &cached_state); + cond_resched(); + } + + /* + * We don't use the higher-level delalloc space functions because our + * num_bytes and disk_num_bytes are different. + */ + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); + if (ret) + goto out_unlock; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); + if (ret) + goto out_free_data_space; + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, + false); + if (ret) + goto out_qgroup_free_data; + + /* Try an inline extent first. 
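[Editorial sketch, not part of the patch] The encoded-write validation above leans on power-of-two alignment helpers: lengths are rounded up to the sector size, offsets must already be sector-aligned. A plain-C restatement of those helpers with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	#define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
	#define ALIGN_DOWN(x, a)  ((x) & ~((uint64_t)(a) - 1))
	#define IS_ALIGNED(x, a)  (((x) & ((uint64_t)(a) - 1)) == 0)

	int main(void)
	{
		uint64_t sectorsize = 4096;
		uint64_t encoded_len = 10000;	/* made-up unaligned length */
		uint64_t start = 8192;

		printf("start aligned:   %d\n", IS_ALIGNED(start, sectorsize));
		printf("rounded-up len:  %llu\n",
		       (unsigned long long)ALIGN_UP(encoded_len, sectorsize));   /* 12288 */
		printf("rounded-down len: %llu\n",
		       (unsigned long long)ALIGN_DOWN(encoded_len, sectorsize)); /* 8192 */
		return 0;
	}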
*/ + if (start == 0 && encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0) { + ret = cow_file_range_inline(inode, encoded->len, orig_count, + compression, pages, true); + if (ret <= 0) { + if (ret == 0) + ret = orig_count; + goto out_delalloc_release; + } + } + + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, + disk_num_bytes, 0, 0, &ins, 1, 1); + if (ret) + goto out_delalloc_release; + extent_reserved = true; + + em = create_io_em(inode, start, num_bytes, + start - encoded->unencoded_offset, ins.objectid, + ins.offset, ins.offset, ram_bytes, compression, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserved; + } + free_extent_map(em); + + ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, + ins.objectid, ins.offset, + encoded->unencoded_offset, + (1 << BTRFS_ORDERED_ENCODED) | + (1 << BTRFS_ORDERED_COMPRESSED), + compression); + if (IS_ERR(ordered)) { + btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); + goto out_free_reserved; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + if (start + encoded->len > inode->vfs_inode.i_size) + i_size_write(&inode->vfs_inode, start + encoded->len); + + unlock_extent(io_tree, start, end, &cached_state); + + btrfs_delalloc_release_extents(inode, num_bytes); + + btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + ret = orig_count; + goto out; + +out_free_reserved: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_delalloc_release: + btrfs_delalloc_release_extents(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); +out_qgroup_free_data: + if (ret < 0) + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); +out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented + * bytes_may_use. + */ + if (!extent_reserved) + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); +out_unlock: + unlock_extent(io_tree, start, end, &cached_state); +out_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kvfree(pages); +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; + return ret; +} + +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. 
+ */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, + bool is_block_group) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp, *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + sp = kmalloc(sizeof(*sp), GFP_NOFS); + if (!sp) + return -ENOMEM; + sp->ptr = ptr; + sp->inode = inode; + sp->is_block_group = is_block_group; + sp->bg_extent_count = 1; + + spin_lock(&fs_info->swapfile_pins_lock); + p = &fs_info->swapfile_pins.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_swapfile_pin, node); + if (sp->ptr < entry->ptr || + (sp->ptr == entry->ptr && sp->inode < entry->inode)) { + p = &(*p)->rb_left; + } else if (sp->ptr > entry->ptr || + (sp->ptr == entry->ptr && sp->inode > entry->inode)) { + p = &(*p)->rb_right; + } else { + if (is_block_group) + entry->bg_extent_count++; + spin_unlock(&fs_info->swapfile_pins_lock); + kfree(sp); + return 1; + } + } + rb_link_node(&sp->node, parent, p); + rb_insert_color(&sp->node, &fs_info->swapfile_pins); + spin_unlock(&fs_info->swapfile_pins_lock); + return 0; +} + +/* Free all of the entries pinned by this swapfile. */ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp; + struct rb_node *node, *next; + + spin_lock(&fs_info->swapfile_pins_lock); + node = rb_first(&fs_info->swapfile_pins); + while (node) { + next = rb_next(node); + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (sp->inode == inode) { + rb_erase(&sp->node, &fs_info->swapfile_pins); + if (sp->is_block_group) { + btrfs_dec_block_group_swap_extents(sp->ptr, + sp->bg_extent_count); + btrfs_put_block_group(sp->ptr); + } + kfree(sp); + } + node = next; + } + spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { + u64 start; + u64 block_start; + u64 block_len; + u64 lowest_ppage; + u64 highest_ppage; + unsigned long nr_pages; + int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, + struct btrfs_swap_info *bsi) +{ + unsigned long nr_pages; + unsigned long max_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + + /* + * Our swapfile may have had its size extended after the swap header was + * written. In that case activating the swapfile should not go beyond + * the max size set in the swap header. 
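[Editorial sketch, not part of the patch] The swapfile-pin insertion above orders rb-tree entries by two keys, the pinned pointer first and the owning inode second, so an exact match means the entry already exists and only its extent count needs bumping. A standalone comparator expressing the same ordering:

	#include <stdint.h>
	#include <stdio.h>

	struct pin_key {
		uintptr_t ptr;		/* block group or device being pinned */
		uintptr_t inode;	/* swapfile inode doing the pinning */
	};

	static int pin_cmp(const struct pin_key *a, const struct pin_key *b)
	{
		if (a->ptr != b->ptr)
			return a->ptr < b->ptr ? -1 : 1;
		if (a->inode != b->inode)
			return a->inode < b->inode ? -1 : 1;
		return 0;		/* exact match: already pinned, bump the count */
	}

	int main(void)
	{
		struct pin_key a = { 0x1000, 0x2000 };	/* made-up addresses */
		struct pin_key b = { 0x1000, 0x3000 };

		printf("cmp = %d (negative means a sorts left of b)\n", pin_cmp(&a, &b));
		return 0;
	}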
+ */ + if (bsi->nr_pages >= sis->max) + return 0; + + max_pages = sis->max - bsi->nr_pages; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); + + first_ppage_reported = first_ppage; + if (bsi->start == 0) + first_ppage_reported++; + if (bsi->lowest_ppage > first_ppage_reported) + bsi->lowest_ppage = first_ppage_reported; + if (bsi->highest_ppage < (next_ppage - 1)) + bsi->highest_ppage = next_ppage - 1; + + ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); + if (ret < 0) + return ret; + bsi->nr_extents += ret; + bsi->nr_pages += nr_pages; + return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + btrfs_free_swapfile_pins(inode); + atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct extent_map *em = NULL; + struct btrfs_device *device = NULL; + struct btrfs_swap_info bsi = { + .lowest_ppage = (sector_t)-1ULL, + }; + int ret = 0; + u64 isize; + u64 start; + + /* + * If the swap file was just created, make sure delalloc is done. If the + * file changes again after this, the user is doing something stupid and + * we don't really care. + */ + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; + + /* + * The inode is locked, so these flags won't change after we check them. + */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { + btrfs_warn(fs_info, "swapfile must not be copy-on-write"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_warn(fs_info, "swapfile must not be checksummed"); + return -EINVAL; + } + + /* + * Balance or device remove/replace/resize can move stuff around from + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not + * really worth the trouble to allow it. + */ + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { + btrfs_warn(fs_info, + "cannot activate swapfile while exclusive operation is running"); + return -EBUSY; + } + + /* + * Prevent snapshot creation while we are activating the swap file. + * We do not want to race with snapshot creation. If snapshot creation + * already started before we bumped nr_swapfiles from 0 to 1 and + * completes before the first write into the swap file after it is + * activated, than that write would fallback to COW. 
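[Editorial sketch, not part of the patch] The swap-extent helper above converts a physical byte range into whole pages by rounding the start up and the end down, and it does not report the file's first page as the lowest usable page because that page holds the swap header. A standalone illustration of the arithmetic, assuming 4K pages and made-up offsets:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

	int main(void)
	{
		uint64_t block_start = 0x100200;	/* made-up physical start, unaligned */
		uint64_t block_len = 5 * PAGE_SIZE;
		uint64_t file_offset = 0;		/* extent starts at the file's beginning */

		uint64_t first = ((block_start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) >> PAGE_SHIFT;
		uint64_t next  = ((block_start + block_len) & ~(PAGE_SIZE - 1)) >> PAGE_SHIFT;
		uint64_t first_reported = first + (file_offset == 0 ? 1 : 0);

		if (first >= next) {
			printf("extent too small to contribute a full page\n");
			return 0;
		}
		printf("usable pages %llu..%llu, lowest reported page %llu (header skipped)\n",
		       (unsigned long long)first, (unsigned long long)(next - 1),
		       (unsigned long long)first_reported);
		return 0;
	}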
+ */ + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because snapshot creation is in progress"); + return -EINVAL; + } + /* + * Snapshots can create extents which require COW even if NODATACOW is + * set. We use this counter to prevent snapshots. We must increment it + * before walking the extents because we don't want a concurrent + * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. + */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } + atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); + + isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + + lock_extent(io_tree, 0, isize - 1, &cached_state); + start = 0; + while (start < isize) { + u64 logical_block_start, physical_block_start; + struct btrfs_block_group *bg; + u64 len = isize - start; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + /* + * It's unlikely we'll ever actually find ourselves + * here, as a file small enough to fit inline won't be + * big enough to store more than the swap header, but in + * case something changes in the future, let's catch it + * here rather than later. 
+ */ + btrfs_warn(fs_info, "swapfile must not be inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + ret = -EINVAL; + goto out; + } + + logical_block_start = em->block_start + (start - em->start); + len = min(len, em->len - (start - em->start)); + free_extent_map(em); + em = NULL; + + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + } else { + btrfs_warn(fs_info, + "swapfile must not be copy-on-write"); + ret = -EINVAL; + goto out; + } + + em = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + btrfs_warn(fs_info, + "swapfile must have single data profile"); + ret = -EINVAL; + goto out; + } + + if (device == NULL) { + device = em->map_lookup->stripes[0].dev; + ret = btrfs_add_swapfile_pin(inode, device, false); + if (ret == 1) + ret = 0; + else if (ret) + goto out; + } else if (device != em->map_lookup->stripes[0].dev) { + btrfs_warn(fs_info, "swapfile must be on one device"); + ret = -EINVAL; + goto out; + } + + physical_block_start = (em->map_lookup->stripes[0].physical + + (logical_block_start - em->start)); + len = min(len, em->len - (logical_block_start - em->start)); + free_extent_map(em); + em = NULL; + + bg = btrfs_lookup_block_group(fs_info, logical_block_start); + if (!bg) { + btrfs_warn(fs_info, + "could not find block group containing swapfile"); + ret = -EINVAL; + goto out; + } + + if (!btrfs_inc_block_group_swap_extents(bg)) { + btrfs_warn(fs_info, + "block group for swapfile at %llu is read-only%s", + bg->start, + atomic_read(&fs_info->scrubs_running) ? + " (scrub running)" : ""); + btrfs_put_block_group(bg); + ret = -EINVAL; + goto out; + } + + ret = btrfs_add_swapfile_pin(inode, bg, true); + if (ret) { + btrfs_put_block_group(bg); + if (ret == 1) + ret = 0; + else + goto out; + } + + if (bsi.block_len && + bsi.block_start + bsi.block_len == physical_block_start) { + bsi.block_len += len; + } else { + if (bsi.block_len) { + ret = btrfs_add_swap_extent(sis, &bsi); + if (ret) + goto out; + } + bsi.start = start; + bsi.block_start = physical_block_start; + bsi.block_len = len; + } + + start += len; + } + + if (bsi.block_len) + ret = btrfs_add_swap_extent(sis, &bsi); + +out: + if (!IS_ERR_OR_NULL(em)) + free_extent_map(em); + + unlock_extent(io_tree, 0, isize - 1, &cached_state); + + if (ret) + btrfs_swap_deactivate(file); + + btrfs_drew_write_unlock(&root->snapshot_lock); + + btrfs_exclop_finish(fs_info); + + if (ret) + return ret; + + if (device) + sis->bdev = device->bdev; + *span = bsi.highest_ppage - bsi.lowest_ppage + 1; + sis->max = bsi.nr_pages; + sis->pages = bsi.nr_pages - 1; + sis->highest_bit = bsi.nr_pages - 1; + return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} +#endif + +/* + * Update the number of bytes used in the VFS' inode. When we replace extents in + * a range (clone, dedupe, fallocate's zero range), we must update the number of + * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls + * always get a correct value. 
+ */ +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes) +{ + if (add_bytes == del_bytes) + return; + + spin_lock(&inode->lock); + if (del_bytes > 0) + inode_sub_bytes(&inode->vfs_inode, del_bytes); + if (add_bytes > 0) + inode_add_bytes(&inode->vfs_inode, add_bytes); + spin_unlock(&inode->lock); +} + +/* + * Verify that there are no ordered extents for a given file range. + * + * @inode: The target inode. + * @start: Start offset of the file range, should be sector size aligned. + * @end: End offset (inclusive) of the file range, its value +1 should be + * sector size aligned. + * + * This should typically be used for cases where we locked an inode's VFS lock in + * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range, we have waited for all ordered + * extents in the range to complete and finally we have locked the file range in + * the inode's io_tree. + */ +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = inode->root; + struct btrfs_ordered_extent *ordered; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); + if (ordered) { + btrfs_err(root->fs_info, +"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", + start, end, btrfs_ino(inode), root->root_key.objectid, + ordered->file_offset, + ordered->file_offset + ordered->num_bytes - 1); + btrfs_put_ordered_extent(ordered); + } + + ASSERT(ordered == NULL); +} + +static const struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename2, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .listxattr = btrfs_listxattr, + .permission = btrfs_permission, + .get_inode_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, +}; + +static const struct file_operations btrfs_dir_file_operations = { + .llseek = btrfs_dir_llseek, + .read = generic_read_dir, + .iterate_shared = btrfs_real_readdir, + .open = btrfs_opendir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_compat_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. 
+ */ +static const struct address_space_operations btrfs_aops = { + .read_folio = btrfs_read_folio, + .writepages = btrfs_writepages, + .readahead = btrfs_readahead, + .invalidate_folio = btrfs_invalidate_folio, + .release_folio = btrfs_release_folio, + .migrate_folio = btrfs_migrate_folio, + .dirty_folio = filemap_dirty_folio, + .error_remove_page = generic_error_remove_page, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, +}; + +static const struct inode_operations btrfs_file_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .listxattr = btrfs_listxattr, + .permission = btrfs_permission, + .fiemap = btrfs_fiemap, + .get_inode_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, +}; +static const struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .listxattr = btrfs_listxattr, + .get_inode_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, +}; +static const struct inode_operations btrfs_symlink_inode_operations = { + .get_link = page_get_link, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .listxattr = btrfs_listxattr, + .update_time = btrfs_update_time, +}; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 0000000000..908215928d --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,4741 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "export.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" +#include "backref.h" +#include "rcu-string.h" +#include "send.h" +#include "dev-replace.h" +#include "props.h" +#include "sysfs.h" +#include "qgroup.h" +#include "tree-log.h" +#include "compression.h" +#include "space-info.h" +#include "delalloc-space.h" +#include "block-group.h" +#include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "file.h" +#include "scrub.h" +#include "super.h" + +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. 
+ */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +struct btrfs_ioctl_send_args_32 { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + compat_uptr_t clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ + struct btrfs_ioctl_send_args_32) + +struct btrfs_ioctl_encoded_io_args_32 { + compat_uptr_t iov; + compat_ulong_t iovcnt; + __s64 offset; + __u64 flags; + __u64 len; + __u64 unencoded_len; + __u64 unencoded_offset; + __u32 compression; + __u32 encryption; + __u8 reserved[64]; +}; + +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#endif + +/* Mask out flags that are inappropriate for the given type of inode. */ +static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, + unsigned int flags) +{ + if (S_ISDIR(inode->i_mode)) + return flags; + else if (S_ISREG(inode->i_mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS + * ioctl. + */ +static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +{ + unsigned int iflags = 0; + u32 flags = binode->flags; + u32 ro_flags = binode->ro_flags; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + if (flags & BTRFS_INODE_NODATACOW) + iflags |= FS_NOCOW_FL; + if (ro_flags & BTRFS_INODE_RO_VERITY) + iflags |= FS_VERITY_FL; + + if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; + else if (flags & BTRFS_INODE_COMPRESS) + iflags |= FS_COMPR_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. 
+ */ +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +{ + struct btrfs_inode *binode = BTRFS_I(inode); + unsigned int new_fl = 0; + + if (binode->flags & BTRFS_INODE_SYNC) + new_fl |= S_SYNC; + if (binode->flags & BTRFS_INODE_IMMUTABLE) + new_fl |= S_IMMUTABLE; + if (binode->flags & BTRFS_INODE_APPEND) + new_fl |= S_APPEND; + if (binode->flags & BTRFS_INODE_NOATIME) + new_fl |= S_NOATIME; + if (binode->flags & BTRFS_INODE_DIRSYNC) + new_fl |= S_DIRSYNC; + if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + new_fl |= S_VERITY; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | + S_VERITY, new_fl); +} + +/* + * Check if @flags are a supported and valid set of FS_*_FL flags and that + * the old and new flags are not conflicting + */ +static int check_fsflags(unsigned int old_flags, unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | + FS_NOCOW_FL)) + return -EOPNOTSUPP; + + /* COMPR and NOCOMP on new/old are valid */ + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) + return -EINVAL; + + /* NOCOW and compression options are mutually exclusive */ + if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + + return 0; +} + +static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, + unsigned int flags) +{ + if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) + return -EPERM; + + return 0; +} + +/* + * Set flags/xflags from the internal inode flags. The remaining items of + * fsxattr are zeroed. 
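+ *
+ * These helpers back the generic VFS fileattr interface, so the
+ * FS_IOC_GETFLAGS/FS_IOC_SETFLAGS and FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR
+ * ioctls are routed here by the VFS rather than through btrfs_ioctl().
+ * A minimal userspace sketch, given an open file descriptor fd on a
+ * btrfs file (error handling omitted; the NOCOW bit only sticks on an
+ * empty regular file, see btrfs_fileattr_set() below):
+ *
+ *	int attr = 0;
+ *
+ *	ioctl(fd, FS_IOC_GETFLAGS, &attr);
+ *	attr |= FS_NOCOW_FL;
+ *	ioctl(fd, FS_IOC_SETFLAGS, &attr);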
+ */ +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + return 0; +} + +int btrfs_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_root *root = binode->root; + struct btrfs_trans_handle *trans; + unsigned int fsflags, old_fsflags; + int ret; + const char *comp = NULL; + u32 binode_flags; + + if (btrfs_root_readonly(root)) + return -EROFS; + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode); + ret = check_fsflags(old_fsflags, fsflags); + if (ret) + return ret; + + ret = check_fsflags_compatible(fs_info, fsflags); + if (ret) + return ret; + + binode_flags = binode->flags; + if (fsflags & FS_SYNC_FL) + binode_flags |= BTRFS_INODE_SYNC; + else + binode_flags &= ~BTRFS_INODE_SYNC; + if (fsflags & FS_IMMUTABLE_FL) + binode_flags |= BTRFS_INODE_IMMUTABLE; + else + binode_flags &= ~BTRFS_INODE_IMMUTABLE; + if (fsflags & FS_APPEND_FL) + binode_flags |= BTRFS_INODE_APPEND; + else + binode_flags &= ~BTRFS_INODE_APPEND; + if (fsflags & FS_NODUMP_FL) + binode_flags |= BTRFS_INODE_NODUMP; + else + binode_flags &= ~BTRFS_INODE_NODUMP; + if (fsflags & FS_NOATIME_FL) + binode_flags |= BTRFS_INODE_NOATIME; + else + binode_flags &= ~BTRFS_INODE_NOATIME; + + /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ + if (!fa->flags_valid) { + /* 1 item for the inode */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto update_flags; + } + + if (fsflags & FS_DIRSYNC_FL) + binode_flags |= BTRFS_INODE_DIRSYNC; + else + binode_flags &= ~BTRFS_INODE_DIRSYNC; + if (fsflags & FS_NOCOW_FL) { + if (S_ISREG(inode->i_mode)) { + /* + * It's safe to turn csums off here, no extents exist. + * Otherwise we want the flag to reflect the real COW + * status of the file and will not set it. + */ + if (inode->i_size == 0) + binode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } else { + binode_flags |= BTRFS_INODE_NODATACOW; + } + } else { + /* + * Revert back under same assumptions as above + */ + if (S_ISREG(inode->i_mode)) { + if (inode->i_size == 0) + binode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); + } else { + binode_flags &= ~BTRFS_INODE_NODATACOW; + } + } + + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. 
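+ *
+ * Setting FS_COMPR_FL below also records the current mount compression
+ * type (falling back to zlib) in the btrfs.compression property, while
+ * clearing both flags removes that property again.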
+ */ + if (fsflags & FS_NOCOMP_FL) { + binode_flags &= ~BTRFS_INODE_COMPRESS; + binode_flags |= BTRFS_INODE_NOCOMPRESS; + } else if (fsflags & FS_COMPR_FL) { + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + binode_flags |= BTRFS_INODE_COMPRESS; + binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + + comp = btrfs_compress_type2str(fs_info->compress_type); + if (!comp || comp[0] == 0) + comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); + } else { + binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + } + + /* + * 1 for inode item + * 2 for properties + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (comp) { + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, + strlen(comp), 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + } else { + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, + 0, 0); + if (ret && ret != -ENODATA) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + } + +update_flags: + binode->flags = binode_flags; + btrfs_sync_inode_flags_to_i_flags(inode); + inode_inc_iversion(inode); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + + out_end_trans: + btrfs_end_transaction(trans); + return ret; +} + +/* + * Start exclusive operation @type, return true on success + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock and + * btrfs_exclop_finish. 
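+ * A true return means fs_info->super_lock is still held by the caller;
+ * that is what btrfs_exclop_start_unlock() releases.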
+ * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->super_lock); + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} + +static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +{ + return put_user(inode->i_generation, arg); +} + +static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_device *device; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * btrfs_trim_block_group() depends on space cache, which is not + * available in zoned filesystem. So, disallow fitrim on a zoned + * filesystem for now. + */ + if (btrfs_is_zoned(fs_info)) + return -EOPNOTSUPP; + + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, + dev_list) { + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) + continue; + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); + } + rcu_read_unlock(); + + if (!num_devices) + return -EOPNOTSUPP; + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + + /* + * NOTE: Don't truncate the range using super->total_bytes. Bytenr of + * block group is in the logical address space, which can be any + * sectorsize aligned bytenr in the range [0, U64_MAX]. 
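+ * Only range.len is sanity checked against the block size here;
+ * range.start and range.len are interpreted in that logical address
+ * space by btrfs_trim_fs(), which also reports the number of bytes
+ * actually trimmed back through range.len.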
+ */ + if (range.len < fs_info->sb->s_blocksize) + return -EINVAL; + + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(fs_info, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +int __pure btrfs_is_empty_uuid(u8 *uuid) +{ + int i; + + for (i = 0; i < BTRFS_UUID_SIZE; i++) { + if (uuid[i]) + return 0; + } + return 1; +} + +/* + * Calculate the number of transaction items to reserve for creating a subvolume + * or snapshot, not including the inode, directory entries, or parent directory. + */ +static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +{ + /* + * 1 to add root block + * 1 to add root item + * 1 to add root ref + * 1 to add root backref + * 1 to add UUID item + * 1 to add qgroup info + * 1 to add qgroup limit + * + * Ideally the last two would only be accounted if qgroups are enabled, + * but that can change between now and the time we would insert them. + */ + unsigned int num_items = 7; + + if (inherit) { + /* 2 to add qgroup relations for each inherited qgroup */ + num_items += 2 * inherit->num_qgroups; + } + return num_items; +} + +static noinline int create_subvol(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + struct btrfs_qgroup_inherit *inherit) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item *root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *new_root; + struct btrfs_block_rsv block_rsv; + struct timespec64 cur_time = current_time(dir); + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .subvol = true, + }; + unsigned int trans_num_items; + int ret; + dev_t anon_dev; + u64 objectid; + + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + + ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); + if (ret) + goto out_root_item; + + /* + * Don't create subvolume whose level is not zero. Or qgroup will be + * screwed up since it assumes subvolume qgroup's level to be 0. + */ + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto out_root_item; + } + + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto out_root_item; + + new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); + if (!new_inode_args.inode) { + ret = -ENOMEM; + goto out_anon_dev; + } + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + trans_num_items += create_subvol_num_items(inherit); + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + trans_num_items, false); + if (ret) + goto out_new_inode_args; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_subvolume_release_metadata(root, &block_rsv); + goto out_new_inode_args; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + /* Tree log can't currently deal with an inode which is a new root. 
*/ + btrfs_set_log_full_commit(trans); + + ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); + if (ret) + goto out; + + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto out; + } + + btrfs_mark_buffer_dirty(trans, leaf); + + inode_item = &root_item->inode; + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, + fs_info->nodesize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); + + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); + btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); + + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); + + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); + generate_random_guid(root_item->uuid); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); + + btrfs_tree_unlock(leaf); + + btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); + + key.objectid = objectid; + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + ret = btrfs_insert_root(trans, fs_info->tree_root, &key, + root_item); + if (ret) { + /* + * Since we don't abort the transaction in this case, free the + * tree block so that we don't leak space and leave the + * filesystem in an inconsistent state (an extent item in the + * extent tree with a backreference for a root that does not + * exists). + */ + btrfs_tree_lock(leaf); + btrfs_clear_buffer_dirty(trans, leaf); + btrfs_tree_unlock(leaf); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + free_extent_buffer(leaf); + goto out; + } + + free_extent_buffer(leaf); + leaf = NULL; + + new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, ret); + goto out; + } + /* anon_dev is owned by new_root now. */ + anon_dev = 0; + BTRFS_I(new_inode_args.inode)->root = new_root; + /* ... and new_root is owned by new_inode_args.inode now. 
*/ + + ret = btrfs_record_root_in_trans(trans, new_root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_uuid_tree_add(trans, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + d_instantiate_new(dentry, new_inode_args.inode); + new_inode_args.inode = NULL; + +out: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv); + + btrfs_end_transaction(trans); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + iput(new_inode_args.inode); +out_anon_dev: + if (anon_dev) + free_anon_bdev(anon_dev); +out_root_item: + kfree(root_item); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct inode *inode; + struct btrfs_pending_snapshot *pending_snapshot; + unsigned int trans_num_items; + struct btrfs_trans_handle *trans; + int ret; + + /* We do not support snapshotting right now. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_warn(fs_info, + "extent tree v2 doesn't support snapshotting yet"); + return -EOPNOTSUPP; + } + + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return -EINVAL; + + if (atomic_read(&root->nr_swapfiles)) { + btrfs_warn(fs_info, + "cannot snapshot subvolume with active swapfile"); + return -ETXTBSY; + } + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); + if (!pending_snapshot) + return -ENOMEM; + + ret = get_anon_bdev(&pending_snapshot->anon_dev); + if (ret < 0) + goto free_pending; + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_KERNEL); + pending_snapshot->path = btrfs_alloc_path(); + if (!pending_snapshot->root_item || !pending_snapshot->path) { + ret = -ENOMEM; + goto free_pending; + } + + btrfs_init_block_rsv(&pending_snapshot->block_rsv, + BTRFS_BLOCK_RSV_TEMP); + /* + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item + */ + trans_num_items = create_subvol_num_items(inherit) + 3; + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + trans_num_items, false); + if (ret) + goto free_pending; + + pending_snapshot->dentry = dentry; + pending_snapshot->root = root; + pending_snapshot->readonly = readonly; + pending_snapshot->dir = dir; + pending_snapshot->inherit = inherit; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + trans->pending_snapshot = pending_snapshot; + + ret = btrfs_commit_transaction(trans); + if (ret) + goto fail; + + ret = pending_snapshot->error; + if (ret) + goto fail; + + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + + d_instantiate(dentry, inode); + ret = 0; + pending_snapshot->anon_dev = 0; +fail: + /* Prevent double freeing of anon_dev */ + if (ret && pending_snapshot->snap) + pending_snapshot->snap->anon_dev = 0; + btrfs_put_root(pending_snapshot->snap); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); 
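+ /*
+  * Reached on both success and error: pending_snapshot, its root_item
+  * and its path are always freed here, while the anon dev is only
+  * released if pending_snapshot->anon_dev is still set (it is zeroed
+  * on success once the new snapshot root owns it).
+  */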
+free_pending: + if (pending_snapshot->anon_dev) + free_anon_bdev(pending_snapshot->anon_dev); + kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); + kfree(pending_snapshot); + + return ret; +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do anything with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ + +static int btrfs_may_delete(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *victim, int isdir) +{ + int error; + + if (d_really_is_negative(victim)) + return -ENOENT; + + BUG_ON(d_inode(victim->d_parent) != dir); + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); + + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (check_sticky(idmap, dir, d_inode(victim)) || + IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || + IS_SWAPFILE(d_inode(victim))) + return -EPERM; + if (isdir) { + if (!d_is_dir(victim)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (d_is_dir(victim)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *child) +{ + if (d_really_is_positive(child)) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) + return -EOVERFLOW; + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(const struct path *parent, + struct mnt_idmap *idmap, + const char *name, int namelen, + struct btrfs_root *snap_src, + bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + struct inode *dir = d_inode(parent->dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct dentry *dentry; + struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); + int error; + + error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (error == -EINTR) + return error; + + dentry = lookup_one(idmap, name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = btrfs_may_create(idmap, dir, dentry); + if (error) + goto out_dput; + + /* + * even if this name doesn't exist, we may get hash collisions. 
+ * check for them now when we can safely fail + */ + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, + dir->i_ino, &name_str); + if (error) + goto out_dput; + + down_read(&fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + + if (snap_src) + error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + else + error = create_subvol(idmap, dir, dentry, inherit); + + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&fs_info->subvol_sem); +out_dput: + dput(dentry); +out_unlock: + btrfs_inode_unlock(BTRFS_I(dir), 0); + return error; +} + +static noinline int btrfs_mksnapshot(const struct path *parent, + struct mnt_idmap *idmap, + const char *name, int namelen, + struct btrfs_root *root, + bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + int ret; + bool snapshot_force_cow = false; + + /* + * Force new buffered writes to reserve space even when NOCOW is + * possible. This is to avoid later writeback (running dealloc) to + * fallback to COW mode and unexpectedly fail with ENOSPC. + */ + btrfs_drew_read_lock(&root->snapshot_lock); + + ret = btrfs_start_delalloc_snapshot(root, false); + if (ret) + goto out; + + /* + * All previous writes have started writeback in NOCOW mode, so now + * we force future writes to fallback to COW mode during snapshot + * creation. + */ + atomic_inc(&root->snapshot_force_cow); + snapshot_force_cow = true; + + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + ret = btrfs_mksubvol(parent, idmap, name, namelen, + root, readonly, inherit); +out: + if (snapshot_force_cow) + atomic_dec(&root->snapshot_force_cow); + btrfs_drew_read_unlock(&root->snapshot_lock); + return ret; +} + +/* + * Try to start exclusive operation @type or cancel it if it's running. + * + * Return: + * 0 - normal mode, newly claimed op started + * >0 - normal mode, something else is running, + * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space + * ECANCELED - cancel mode, successful cancel + * ENOTCONN - cancel mode, operation not running anymore + */ +static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type, bool cancel) +{ + if (!cancel) { + /* Start normal op */ + if (!btrfs_exclop_start(fs_info, type)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + /* Exclusive operation is now claimed */ + return 0; + } + + /* Cancel running op */ + if (btrfs_exclop_start_try_lock(fs_info, type)) { + /* + * This blocks any exclop finish from setting it to NONE, so we + * request cancellation. Either it runs and we will wait for it, + * or it has finished and no waiting will happen. 
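+ * The wait below is keyed on the BTRFS_FS_RELOC_RUNNING bit, which the
+ * relocation code clears (waking any waiter) when the running
+ * relocation finishes or honours the cancel request.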
+ */ + atomic_inc(&fs_info->reloc_cancel_req); + btrfs_exclop_start_unlock(fs_info); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, + TASK_INTERRUPTIBLE); + + return -ECANCELED; + } + + /* Something else is running or none */ + return -ENOTCONN; +} + +static noinline int btrfs_ioctl_resize(struct file *file, + void __user *arg) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *retptr; + char *devstr = NULL; + int ret = 0; + int mod = 0; + bool cancel; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + /* + * Read the arguments before checking exclusivity to be able to + * distinguish regular resize and cancel + */ + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out_drop; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + sizestr = vol_args->name; + cancel = (strcmp("cancel", sizestr) == 0); + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); + if (ret) + goto out_free; + /* Exclusive operation is now claimed */ + + devstr = strchr(sizestr, ':'); + if (devstr) { + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_finish; + if (!devid) { + ret = -EINVAL; + goto out_finish; + } + btrfs_info(fs_info, "resizing devid %llu", devid); + } + + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); + if (!device) { + btrfs_info(fs_info, "resizer unable to find device %llu", + devid); + ret = -ENODEV; + goto out_finish; + } + + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + btrfs_info(fs_info, + "resizer unable to apply on readonly device %llu", + devid); + ret = -EPERM; + goto out_finish; + } + + if (!strcmp(sizestr, "max")) + new_size = bdev_nr_bytes(device->bdev); + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { + ret = -EINVAL; + goto out_finish; + } + } + + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { + ret = -EPERM; + goto out_finish; + } + + old_size = btrfs_device_get_total_bytes(device); + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_finish; + } + new_size = old_size - new_size; + } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -ERANGE; + goto out_finish; + } + new_size = old_size + new_size; + } + + if (new_size < SZ_256M) { + ret = -EINVAL; + goto out_finish; + } + if (new_size > bdev_nr_bytes(device->bdev)) { + ret = -EFBIG; + goto out_finish; + } + + new_size = round_down(new_size, fs_info->sectorsize); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_finish; + } + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans); + } else if (new_size < old_size) { + ret = btrfs_shrink_device(device, new_size); + } /* equal, nothing need to do */ + + if (ret == 0 && new_size != old_size) + 
btrfs_info_in_rcu(fs_info, + "resize device %s (devid %llu) from %llu to %llu", + btrfs_dev_name(device), device->devid, + old_size, new_size); +out_finish: + btrfs_exclop_finish(fs_info); +out_free: + kfree(vol_args); +out_drop: + mnt_drop_write_file(file); + return ret; +} + +static noinline int __btrfs_ioctl_snap_create(struct file *file, + struct mnt_idmap *idmap, + const char *name, unsigned long fd, int subvol, + bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + int namelen; + int ret = 0; + + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + namelen = strlen(name); + if (strchr(name, '/')) { + ret = -EINVAL; + goto out_drop_write; + } + + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out_drop_write; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, idmap, name, + namelen, NULL, readonly, inherit); + } else { + struct fd src = fdget(fd); + struct inode *src_inode; + if (!src.file) { + ret = -EINVAL; + goto out_drop_write; + } + + src_inode = file_inode(src.file); + if (src_inode->i_sb != file_inode(file)->i_sb) { + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, + "Snapshot src from another FS"); + ret = -EXDEV; + } else if (!inode_owner_or_capable(idmap, src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; + } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { + /* + * Snapshots must be made with the src_inode referring + * to the subvolume inode, otherwise the permission + * checking above is useless because we may have + * permission on a lower directory but not the subvol + * itself. + */ + ret = -EINVAL; + } else { + ret = btrfs_mksnapshot(&file->f_path, idmap, + name, namelen, + BTRFS_I(src_inode)->root, + readonly, inherit); + } + fdput(src); + } +out_drop_write: + mnt_drop_write_file(file); +out: + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), + vol_args->name, vol_args->fd, subvol, + false, NULL); + + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + bool readonly = false; + struct btrfs_qgroup_inherit *inherit = NULL; + + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { + ret = -EOPNOTSUPP; + goto free_args; + } + + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { + u64 nums; + + if (vol_args->size < sizeof(*inherit) || + vol_args->size > PAGE_SIZE) { + ret = -EINVAL; + goto free_args; + } + inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); + if (IS_ERR(inherit)) { + ret = PTR_ERR(inherit); + goto free_args; + } + + if (inherit->num_qgroups > PAGE_SIZE || + inherit->num_ref_copies > 
PAGE_SIZE || + inherit->num_excl_copies > PAGE_SIZE) { + ret = -EINVAL; + goto free_inherit; + } + + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + if (vol_args->size != struct_size(inherit, qgroups, nums)) { + ret = -EINVAL; + goto free_inherit; + } + } + + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), + vol_args->name, vol_args->fd, subvol, + readonly, inherit); + if (ret) + goto free_inherit; +free_inherit: + kfree(inherit); +free_args: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out_drop_write; + } + + if (copy_from_user(&flags, arg, sizeof(flags))) { + ret = -EFAULT; + goto out_drop_write; + } + + if (flags & ~BTRFS_SUBVOL_RDONLY) { + ret = -EOPNOTSUPP; + goto out_drop_write; + } + + down_write(&fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out_drop_sem; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) { + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + } else { + /* + * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans); + goto out_reset; + } + + ret = btrfs_commit_transaction(trans); + +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out_drop_sem: + up_write(&fs_info->subvol_sem); +out_drop_write: + mnt_drop_write_file(file); +out: + return ret; +} + +static noinline int key_in_sk(struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk) +{ + struct btrfs_key test; + int ret; + + test.objectid = sk->min_objectid; + test.type = sk->min_type; + test.offset = sk->min_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret < 0) + return 0; + + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = 
sk->max_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret > 0) + return 0; + return 1; +} + +static noinline int copy_to_sk(struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk, + u64 *buf_size, + char __user *ubuf, + unsigned long *sk_offset, + int *num_found) +{ + u64 found_transid; + struct extent_buffer *leaf; + struct btrfs_ioctl_search_header sh; + struct btrfs_key test; + unsigned long item_off; + unsigned long item_len; + int nritems; + int i; + int slot; + int ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + + if (btrfs_header_generation(leaf) > sk->max_transid) { + i = nritems; + goto advance_key; + } + found_transid = btrfs_header_generation(leaf); + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); + item_len = btrfs_item_size(leaf, i); + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > *buf_size) { + if (*num_found) { + ret = 1; + goto out; + } + + /* + * return one empty item back for v1, which does not + * handle -EOVERFLOW + */ + + *buf_size = sizeof(sh) + item_len; + item_len = 0; + ret = -EOVERFLOW; + } + + if (sizeof(sh) + item_len + *sk_offset > *buf_size) { + ret = 1; + goto out; + } + + sh.objectid = key->objectid; + sh.offset = key->offset; + sh.type = key->type; + sh.len = item_len; + sh.transid = found_transid; + + /* + * Copy search result header. If we fault then loop again so we + * can fault in the pages and -EFAULT there if there's a + * problem. Otherwise we'll fault and then copy the buffer in + * properly this next time through + */ + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = 0; + goto out; + } + + *sk_offset += sizeof(sh); + + if (item_len) { + char __user *up = ubuf + *sk_offset; + /* + * Copy the item, same behavior as above, but reset the + * * sk_offset so we copy the full thing again. 
+ */ + if (read_extent_buffer_to_user_nofault(leaf, up, + item_off, item_len)) { + ret = 0; + *sk_offset -= sizeof(sh); + goto out; + } + + *sk_offset += item_len; + } + (*num_found)++; + + if (ret) /* -EOVERFLOW from above */ + goto out; + + if (*num_found >= sk->nr_items) { + ret = 1; + goto out; + } + } +advance_key: + ret = 0; + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + if (btrfs_comp_cpu_keys(key, &test) >= 0) + ret = 1; + else if (key->offset < (u64)-1) + key->offset++; + else if (key->type < (u8)-1) { + key->offset = 0; + key->type++; + } else if (key->objectid < (u64)-1) { + key->offset = 0; + key->type = 0; + key->objectid++; + } else + ret = 1; +out: + /* + * 0: all items from this leaf copied, continue with next + * 1: * more items can be copied, but unused buffer is too small + * * all items were found + * Either way, it will stops the loop which iterates to the next + * leaf + * -EOVERFLOW: item was to large for buffer + * -EFAULT: could not copy extent buffer back to userspace + */ + return ret; +} + +static noinline int search_ioctl(struct inode *inode, + struct btrfs_ioctl_search_key *sk, + u64 *buf_size, + char __user *ubuf) +{ + struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_root *root; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + int num_found = 0; + unsigned long sk_offset = 0; + + if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { + *buf_size = sizeof(struct btrfs_ioctl_search_header); + return -EOVERFLOW; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (sk->tree_id == 0) { + /* search the root of the inode that was passed */ + root = btrfs_grab_root(BTRFS_I(inode)->root); + } else { + root = btrfs_get_fs_root(info, sk->tree_id, true); + if (IS_ERR(root)) { + btrfs_free_path(path); + return PTR_ERR(root); + } + } + + key.objectid = sk->min_objectid; + key.type = sk->min_type; + key.offset = sk->min_offset; + + while (1) { + ret = -EFAULT; + /* + * Ensure that the whole user buffer is faulted in at sub-page + * granularity, otherwise the loop may live-lock. + */ + if (fault_in_subpage_writeable(ubuf + sk_offset, + *buf_size - sk_offset)) + break; + + ret = btrfs_search_forward(root, &key, path, sk->min_transid); + if (ret != 0) { + if (ret > 0) + ret = 0; + goto err; + } + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, + &sk_offset, &num_found); + btrfs_release_path(path); + if (ret) + break; + + } + if (ret > 0) + ret = 0; +err: + sk->nr_items = num_found; + btrfs_put_root(root); + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_tree_search(struct inode *inode, + void __user *argp) +{ + struct btrfs_ioctl_search_args __user *uargs = argp; + struct btrfs_ioctl_search_key sk; + int ret; + u64 buf_size; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&sk, &uargs->key, sizeof(sk))) + return -EFAULT; + + buf_size = sizeof(uargs->buf); + + ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + + /* + * In the origin implementation an overflow is handled by returning a + * search header with a len of zero, so reset ret. 
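+ * (The v2 variant below keeps -EOVERFLOW and instead copies the buffer
+ * size that would have been required back to userspace.)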
+ */ + if (ret == -EOVERFLOW) + ret = 0; + + if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) + ret = -EFAULT; + return ret; +} + +static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, + void __user *argp) +{ + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; + struct btrfs_ioctl_search_args_v2 args; + int ret; + u64 buf_size; + const u64 buf_limit = SZ_16M; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* copy search header and buffer size */ + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + buf_size = args.buf_size; + + /* limit result size to 16MB */ + if (buf_size > buf_limit) + buf_size = buf_limit; + + ret = search_ioctl(inode, &args.key, &buf_size, + (char __user *)(&uarg->buf[0])); + if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) + ret = -EFAULT; + else if (ret == -EOVERFLOW && + copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) + ret = -EFAULT; + + return ret; +} + +/* + * Search INODE_REFs to identify path name of 'dirid' directory + * in a 'tree_id' tree. and sets path name to 'name'. + */ +static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, + u64 tree_id, u64 dirid, char *name) +{ + struct btrfs_root *root; + struct btrfs_key key; + char *ptr; + int ret = -1; + int slot; + int len; + int total_len = 0; + struct btrfs_inode_ref *iref; + struct extent_buffer *l; + struct btrfs_path *path; + + if (dirid == BTRFS_FIRST_FREE_OBJECTID) { + name[0]='\0'; + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; + + root = btrfs_get_fs_root(info, tree_id, true); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + root = NULL; + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) + goto out; + else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + l = path->nodes[0]; + slot = path->slots[0]; + + iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(l, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto out; + } + + *(ptr + len) = '/'; + read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); + + if (key.offset == BTRFS_FIRST_FREE_OBJECTID) + break; + + btrfs_release_path(path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + } + memmove(name, ptr, total_len); + name[total_len] = '\0'; + ret = 0; +out: + btrfs_put_root(root); + btrfs_free_path(path); + return ret; +} + +static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, + struct inode *inode, + struct btrfs_ioctl_ino_lookup_user_args *args) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct super_block *sb = inode->i_sb; + struct btrfs_key upper_limit = BTRFS_I(inode)->location; + u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 dirid = args->dirid; + unsigned long item_off; + unsigned long item_len; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct btrfs_root *root = NULL; + struct btrfs_path *path; + struct btrfs_key key, key2; + struct extent_buffer *leaf; + struct inode *temp_inode; + char *ptr; + int slot; + int len; + int total_len = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * If the bottom subvolume does not exist directly under upper_limit, + * construct the path in from the bottom up. 
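+ * This walks the INODE_REF items from the requested dirid up towards
+ * upper_limit, prepending one path component per step and checking
+ * MAY_READ | MAY_EXEC on every directory inode along the way.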
+ */ + if (dirid != upper_limit.objectid) { + ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; + + root = btrfs_get_fs_root(fs_info, treeid, true); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) + goto out_put; + else if (ret > 0) { + ret = -ENOENT; + goto out_put; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + + iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(leaf, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < args->path) { + ret = -ENAMETOOLONG; + goto out_put; + } + + *(ptr + len) = '/'; + read_extent_buffer(leaf, ptr, + (unsigned long)(iref + 1), len); + + /* Check the read+exec permission of this directory */ + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_ITEM_KEY); + if (ret < 0) { + goto out_put; + } else if (ret > 0) { + ret = -ENOENT; + goto out_put; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key2, slot); + if (key2.objectid != dirid) { + ret = -ENOENT; + goto out_put; + } + + /* + * We don't need the path anymore, so release it and + * avoid deadlocks and lockdep warnings in case + * btrfs_iget() needs to lookup the inode from its root + * btree and lock the same leaf. + */ + btrfs_release_path(path); + temp_inode = btrfs_iget(sb, key2.objectid, root); + if (IS_ERR(temp_inode)) { + ret = PTR_ERR(temp_inode); + goto out_put; + } + ret = inode_permission(idmap, temp_inode, + MAY_READ | MAY_EXEC); + iput(temp_inode); + if (ret) { + ret = -EACCES; + goto out_put; + } + + if (key.offset == upper_limit.objectid) + break; + if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { + ret = -EACCES; + goto out_put; + } + + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + } + + memmove(args->path, ptr, total_len); + args->path[total_len] = '\0'; + btrfs_put_root(root); + root = NULL; + btrfs_release_path(path); + } + + /* Get the bottom subvolume's name from ROOT_REF */ + key.objectid = treeid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = args->treeid; + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + item_off = btrfs_item_ptr_offset(leaf, slot); + item_len = btrfs_item_size(leaf, slot); + /* Check if dirid in ROOT_REF corresponds to passed dirid */ + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { + ret = -EINVAL; + goto out; + } + + /* Copy subvolume's name */ + item_off += sizeof(struct btrfs_root_ref); + item_len -= sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, args->name, item_off, item_len); + args->name[item_len] = 0; + +out_put: + btrfs_put_root(root); +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, + void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_args *args; + int ret = 0; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. 
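+ * A treeid of 0 means "the subvolume this ioctl's inode lives in";
+ * resolving an objectid other than the subvolume root itself to a path
+ * still requires CAP_SYS_ADMIN below.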
+ */ + if (args->treeid == 0) + args->treeid = root->root_key.objectid; + + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + + ret = btrfs_search_path_in_tree(root->fs_info, + args->treeid, args->objectid, + args->name); + +out: + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +/* + * Version of ino_lookup ioctl (unprivileged) + * + * The main differences from ino_lookup ioctl are: + * + * 1. Read + Exec permission will be checked using inode_permission() during + * path construction. -EACCES will be returned in case of failure. + * 2. Path construction will be stopped at the inode number which corresponds + * to the fd with which this ioctl is called. If constructed path does not + * exist under fd's inode, -EACCES will be returned. + * 3. The name of bottom subvolume is also searched and filled. + */ +static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_user_args *args; + struct inode *inode; + int ret; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = file_inode(file); + + if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && + BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { + /* + * The subvolume does not exist under fd with which this is + * called + */ + kfree(args); + return -EACCES; + } + + ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ +static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) +{ + struct btrfs_ioctl_get_subvol_info_args *subvol_info; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_item *root_item; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long item_off; + unsigned long item_len; + int slot; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); + if (!subvol_info) { + btrfs_free_path(path); + return -ENOMEM; + } + + fs_info = BTRFS_I(inode)->root->fs_info; + + /* Get root_item of inode's subvolume */ + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + root = btrfs_get_fs_root(fs_info, key.objectid, true); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_free; + } + root_item = &root->root_item; + + subvol_info->treeid = key.objectid; + + subvol_info->generation = btrfs_root_generation(root_item); + subvol_info->flags = btrfs_root_flags(root_item); + + memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); + memcpy(subvol_info->parent_uuid, root_item->parent_uuid, + BTRFS_UUID_SIZE); + memcpy(subvol_info->received_uuid, root_item->received_uuid, + BTRFS_UUID_SIZE); + + subvol_info->ctransid = btrfs_root_ctransid(root_item); + subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); + subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); + + subvol_info->otransid = btrfs_root_otransid(root_item); + subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); + subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); + + subvol_info->stransid = btrfs_root_stransid(root_item); 
+ subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); + subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); + + subvol_info->rtransid = btrfs_root_rtransid(root_item); + subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); + subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); + + if (key.objectid != BTRFS_FS_TREE_OBJECTID) { + /* Search root tree for ROOT_BACKREF of this subvolume */ + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(fs_info->tree_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == subvol_info->treeid && + key.type == BTRFS_ROOT_BACKREF_KEY) { + subvol_info->parent_id = key.offset; + + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); + + item_off = btrfs_item_ptr_offset(leaf, slot) + + sizeof(struct btrfs_root_ref); + item_len = btrfs_item_size(leaf, slot) + - sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, subvol_info->name, + item_off, item_len); + } else { + ret = -ENOENT; + goto out; + } + } + + btrfs_free_path(path); + path = NULL; + if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) + ret = -EFAULT; + +out: + btrfs_put_root(root); +out_free: + btrfs_free_path(path); + kfree(subvol_info); + return ret; +} + +/* + * Return ROOT_REF information of the subvolume containing this inode + * except the subvolume name. 
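+ *
+ * At most BTRFS_MAX_ROOTREF_BUFFER_NUM entries are returned per call;
+ * on -EOVERFLOW the caller can simply repeat the ioctl, since
+ * min_treeid is advanced past the last entry that was copied out.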
+ */ +static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, + void __user *argp) +{ + struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; + struct btrfs_root_ref *rref; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u64 objectid; + int slot; + int ret; + u8 found; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rootrefs = memdup_user(argp, sizeof(*rootrefs)); + if (IS_ERR(rootrefs)) { + btrfs_free_path(path); + return PTR_ERR(rootrefs); + } + + objectid = root->root_key.objectid; + key.objectid = objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = rootrefs->min_treeid; + found = 0; + + root = root->fs_info->tree_root; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) { + ret = 0; + goto out; + } + + if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) { + ret = -EOVERFLOW; + goto out; + } + + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + rootrefs->rootref[found].treeid = key.offset; + rootrefs->rootref[found].dirid = + btrfs_root_ref_dirid(leaf, rref); + found++; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + +out: + btrfs_free_path(path); + + if (!ret || ret == -EOVERFLOW) { + rootrefs->num_items = found; + /* update min_treeid for next search */ + if (found) + rootrefs->min_treeid = + rootrefs->rootref[found - 1].treeid + 1; + if (copy_to_user(argp, rootrefs, sizeof(*rootrefs))) + ret = -EFAULT; + } + + kfree(rootrefs); + + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg, + bool destroy_v2) +{ + struct dentry *parent = file->f_path.dentry; + struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); + struct dentry *dentry; + struct inode *dir = d_inode(parent); + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; + struct mnt_idmap *idmap = file_mnt_idmap(file); + char *subvol_name, *subvol_name_ptr = NULL; + int subvol_namelen; + int err = 0; + bool destroy_parent = false; + + /* We don't support snapshots with extent tree v2 yet. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + + if (destroy_v2) { + vol_args2 = memdup_user(arg, sizeof(*vol_args2)); + if (IS_ERR(vol_args2)) + return PTR_ERR(vol_args2); + + if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { + err = -EOPNOTSUPP; + goto out; + } + + /* + * If SPEC_BY_ID is not set, we are looking for the subvolume by + * name, same as v1 currently does. 
+ */ + if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { + vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + subvol_name = vol_args2->name; + + err = mnt_want_write_file(file); + if (err) + goto out; + } else { + struct inode *old_dir; + + if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write_file(file); + if (err) + goto out; + + dentry = btrfs_get_dentry(fs_info->sb, + BTRFS_FIRST_FREE_OBJECTID, + vol_args2->subvolid, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_drop_write; + } + + /* + * Change the default parent since the subvolume being + * deleted can be outside of the current mount point. + */ + parent = btrfs_get_parent(dentry); + + /* + * At this point dentry->d_name can point to '/' if the + * subvolume we want to destroy is outside of the + * current mount point, so we need to release the + * current dentry and execute the lookup to return a new + * one with ->d_name pointing to the + * /subvol_name. + */ + dput(dentry); + if (IS_ERR(parent)) { + err = PTR_ERR(parent); + goto out_drop_write; + } + old_dir = dir; + dir = d_inode(parent); + + /* + * If v2 was used with SPEC_BY_ID, a new parent was + * allocated since the subvolume can be outside of the + * current mount point. Later on we need to release this + * new parent dentry. + */ + destroy_parent = true; + + /* + * On idmapped mounts, deletion via subvolid is + * restricted to subvolumes that are immediate + * ancestors of the inode referenced by the file + * descriptor in the ioctl. Otherwise the idmapping + * could potentially be abused to delete subvolumes + * anywhere in the filesystem the user wouldn't be able + * to delete without an idmapped mount. + */ + if (old_dir != dir && idmap != &nop_mnt_idmap) { + err = -EOPNOTSUPP; + goto free_parent; + } + + subvol_name_ptr = btrfs_get_subvol_name_from_objectid( + fs_info, vol_args2->subvolid); + if (IS_ERR(subvol_name_ptr)) { + err = PTR_ERR(subvol_name_ptr); + goto free_parent; + } + /* subvol_name_ptr is already nul terminated */ + subvol_name = (char *)kbasename(subvol_name_ptr); + } + } else { + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + subvol_name = vol_args->name; + + err = mnt_want_write_file(file); + if (err) + goto out; + } + + subvol_namelen = strlen(subvol_name); + + if (strchr(subvol_name, '/') || + strncmp(subvol_name, "..", subvol_namelen) == 0) { + err = -EINVAL; + goto free_subvol_name; + } + + if (!S_ISDIR(dir->i_mode)) { + err = -ENOTDIR; + goto free_subvol_name; + } + + err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (err == -EINTR) + goto free_subvol_name; + dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (d_really_is_negative(dentry)) { + err = -ENOENT; + goto out_dput; + } + + inode = d_inode(dentry); + dest = BTRFS_I(inode)->root; + if (!capable(CAP_SYS_ADMIN)) { + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ a check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2).
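A minimal userspace sketch of the SPEC_BY_ID deletion path handled above, assuming a placeholder mount point and a made-up subvolume id (256); with BTRFS_SUBVOL_SPEC_BY_ID the opened directory only anchors the filesystem, since the kernel resolves the real parent itself:

/* Illustrative userspace sketch; path and subvolume id are placeholders. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int destroy_subvol_by_id(const char *mnt, __u64 subvolid)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int fd = open(mnt, O_RDONLY | O_DIRECTORY);

	if (fd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	args.flags = BTRFS_SUBVOL_SPEC_BY_ID;
	args.subvolid = subvolid;
	return ioctl(fd, BTRFS_IOC_SNAP_DESTROY_V2, &args);
}

int main(void)
{
	if (destroy_subvol_by_id("/mnt", 256))
		perror("BTRFS_IOC_SNAP_DESTROY_V2");
	return 0;
}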
+ */ + err = -EPERM; + if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ + err = -EINVAL; + if (root == dest) + goto out_dput; + + err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; + } + + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(idmap, dir, dentry, 1); + if (err) + goto out_dput; + + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + btrfs_inode_lock(BTRFS_I(inode), 0); + err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!err) + d_delete_notify(dir, dentry); + +out_dput: + dput(dentry); +out_unlock_dir: + btrfs_inode_unlock(BTRFS_I(dir), 0); +free_subvol_name: + kfree(subvol_name_ptr); +free_parent: + if (destroy_parent) + dput(parent); +out_drop_write: + mnt_drop_write_file(file); +out: + kfree(vol_args2); + kfree(vol_args); + return err; +} + +static int btrfs_ioctl_defrag(struct file *file, void __user *argp) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args range = {0}; + int ret; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_defrag_root(root); + break; + case S_IFREG: + /* + * Note that this does not check the file descriptor for write + * access. This prevents defragmenting executables that are + * running and allows defrag on files open in read-only mode. 
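The defrag handler here takes an optional struct btrfs_ioctl_defrag_range_args from user space. A minimal userspace sketch that defragments a whole file, assuming a placeholder path and an illustrative 256 KiB extent threshold; the file is opened read-only on purpose, since the handler checks inode write permission rather than the fd mode:

/* Illustrative userspace sketch; the path and threshold are placeholders. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd = open("/mnt/file", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* whole file */
	range.extent_thresh = 256 * 1024;	/* illustrative threshold */
	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");
	return 0;
}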
+ */ + if (!capable(CAP_SYS_ADMIN) && + inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) { + ret = -EPERM; + goto out; + } + + if (argp) { + if (copy_from_user(&range, argp, sizeof(range))) { + ret = -EFAULT; + goto out; + } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } + /* compression requires us to start the IO */ + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range.flags |= BTRFS_DEFRAG_RANGE_START_IO; + range.extent_thresh = (u32)-1; + } + } else { + /* the rest are all set to zero by kzalloc */ + range.len = (u64)-1; + } + ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + &range, BTRFS_OLDEST_GENERATION, 0); + if (ret > 0) + ret = 0; + break; + default: + ret = -EINVAL; + } +out: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + bool restore_op = false; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); + return -EINVAL; + } + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { + if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + /* + * We can do the device add because we have a paused balance, + * change the exclusive op type and remember we should bring + * back the paused balance. + */ + fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD; + btrfs_exclop_start_unlock(fs_info); + restore_op = true; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(fs_info, vol_args->name); + + if (!ret) + btrfs_info(fs_info, "disk added %s", vol_args->name); + + kfree(vol_args); +out: + if (restore_op) + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + else + btrfs_exclop_finish(fs_info); + return ret; +} + +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_ioctl_vol_args_v2 *vol_args; + struct block_device *bdev = NULL; + void *holder; + int ret; + bool cancel = false; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + args.devid = vol_args->devid; + } else if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret) + goto err_drop; + + /* Exclusive operation is now claimed */ + ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + + btrfs_exclop_finish(fs_info); + + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) + btrfs_info(fs_info, "device deleted: id %llu", + vol_args->devid); + else + btrfs_info(fs_info, "device deleted: %s", + vol_args->name); + } +err_drop: + mnt_drop_write_file(file); + if (bdev) +
blkdev_put(bdev, holder); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_ioctl_vol_args *vol_args; + struct block_device *bdev = NULL; + void *holder; + int ret; + bool cancel = false; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret == 0) { + ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + if (!ret) + btrfs_info(fs_info, "disk deleted %s", vol_args->name); + btrfs_exclop_finish(fs_info); + } + + mnt_drop_write_file(file); + if (bdev) + blkdev_put(bdev, holder); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_fs_info_args *fi_args; + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 flags_in; + int ret = 0; + + fi_args = memdup_user(arg, sizeof(*fi_args)); + if (IS_ERR(fi_args)) + return PTR_ERR(fi_args); + + flags_in = fi_args->flags; + memset(fi_args, 0, sizeof(*fi_args)); + + rcu_read_lock(); + fi_args->num_devices = fs_devices->num_devices; + + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; + } + rcu_read_unlock(); + + memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); + fi_args->nodesize = fs_info->nodesize; + fi_args->sectorsize = fs_info->sectorsize; + fi_args->clone_alignment = fs_info->sectorsize; + + if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { + fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); + fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); + fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; + } + + if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { + fi_args->generation = fs_info->generation; + fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; + } + + if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { + memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, + sizeof(fi_args->metadata_uuid)); + fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; + } + + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; + + kfree(fi_args); + return ret; +} + +static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct btrfs_ioctl_dev_info_args *di_args; + struct btrfs_device *dev; + int ret = 0; + + di_args = memdup_user(arg, sizeof(*di_args)); + if (IS_ERR(di_args)) + return PTR_ERR(di_args); + + args.devid = di_args->devid; + if (!btrfs_is_empty_uuid(di_args->uuid)) + args.uuid = di_args->uuid; + + rcu_read_lock(); + dev = btrfs_find_device(fs_info->fs_devices, &args); + if (!dev) { + ret = -ENODEV; + goto out; + } + + di_args->devid = dev->devid; + di_args->bytes_used = btrfs_device_get_bytes_used(dev); + di_args->total_bytes = btrfs_device_get_total_bytes(dev); + 
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + if (dev->name) + strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); + else + di_args->path[0] = '\0'; + +out: + rcu_read_unlock(); + if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) + ret = -EFAULT; + + kfree(di_args); + return ret; +} + +static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_trans_handle *trans; + struct btrfs_path *path = NULL; + struct btrfs_disk_key disk_key; + struct fscrypt_str name = FSTR_INIT("default", 7); + u64 objectid = 0; + u64 dir_id; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) { + ret = -EFAULT; + goto out; + } + + if (!objectid) + objectid = BTRFS_FS_TREE_OBJECTID; + + new_root = btrfs_get_fs_root(fs_info, objectid, true); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out; + } + if (!is_fstree(new_root->root_key.objectid)) { + ret = -ENOENT; + goto out_free; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out_free; + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_free; + } + + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, + dir_id, &name, 1); + if (IS_ERR_OR_NULL(di)) { + btrfs_release_path(path); + btrfs_end_transaction(trans); + btrfs_err(fs_info, + "Umm, you don't have the default diritem, this isn't going to work"); + ret = -ENOENT; + goto out_free; + } + + btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); + btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + btrfs_release_path(path); + + btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); + btrfs_end_transaction(trans); +out_free: + btrfs_put_root(new_root); + btrfs_free_path(path); +out: + mnt_drop_write_file(file); + return ret; +} + +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->length; + space->used_bytes += block_group->used; + } +} + +static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_space_args space_args = { 0 }; + struct btrfs_ioctl_space_info space; + struct btrfs_ioctl_space_info *dest; + struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info __user *user_dest; + struct btrfs_space_info *info; + static const u64 types[] = { + BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA + }; + int num_types = 4; + int alloc_size; + int ret = 0; + u64 slot_count = 0; + int i, c; + + if (copy_from_user(&space_args, + (struct btrfs_ioctl_space_args __user *)arg, + sizeof(space_args))) + return -EFAULT; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + 
list_for_each_entry(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } + + /* + * Global block reserve, exported as a space_info + */ + slot_count++; + + /* space_slots == 0 means they are asking for a count */ + if (space_args.space_slots == 0) { + space_args.total_spaces = slot_count; + goto out; + } + + slot_count = min_t(u64, space_args.space_slots, slot_count); + + alloc_size = sizeof(*dest) * slot_count; + + /* we generally have at most 6 or so space infos, one for each raid + * level. So, a whole page should be more than enough for everyone + */ + if (alloc_size > PAGE_SIZE) + return -ENOMEM; + + space_args.total_spaces = 0; + dest = kmalloc(alloc_size, GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest_orig = dest; + + /* now we have a buffer to copy into */ + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + if (!slot_count) + break; + + info = NULL; + list_for_each_entry(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + get_block_group_info(&info->block_groups[c], + &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + slot_count--; + } + if (!slot_count) + break; + } + up_read(&info->groups_sem); + } + + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; + memcpy(dest, &space, sizeof(space)); + space_args.total_spaces++; + } + + user_dest = (struct btrfs_ioctl_space_info __user *) + (arg + sizeof(struct btrfs_ioctl_space_args)); + + if (copy_to_user(user_dest, dest_orig, alloc_size)) + ret = -EFAULT; + + kfree(dest_orig); +out: + if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) + ret = -EFAULT; + + return ret; +} + +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + void __user *argp) +{ + struct btrfs_trans_handle *trans; + u64 transid; + + /* + * Start orphan cleanup here for the given root in case it hasn't been + * started already by other means. Errors are handled in the other + * functions during transaction commit. + */ + btrfs_orphan_cleanup(root); + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + + /* No running transaction, don't bother */ + transid = root->fs_info->last_trans_committed; + goto out; + } + transid = trans->transid; + btrfs_commit_transaction_async(trans); +out: + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; + return 0; +} + +static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, + void __user *argp) +{ + /* By default wait for the current transaction. 
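btrfs_ioctl_space_info() above implements a two-call protocol: a first call with space_slots == 0 only reports how many slots exist, and a second call with a suitably sized buffer fetches the per-type usage, including the global reserve. A minimal userspace sketch of that pattern, assuming a placeholder mount point:

/* Illustrative userspace sketch; the mount point is a placeholder. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_space_args probe = { 0 }, *args;
	int fd = open("/mnt", O_RDONLY);

	/* First call: space_slots == 0 just counts the slots. */
	if (fd < 0 || ioctl(fd, BTRFS_IOC_SPACE_INFO, &probe) < 0)
		return 1;

	args = calloc(1, sizeof(*args) +
			 probe.total_spaces * sizeof(struct btrfs_ioctl_space_info));
	if (!args)
		return 1;
	args->space_slots = probe.total_spaces;

	/* Second call: fetch the per-type usage. */
	if (ioctl(fd, BTRFS_IOC_SPACE_INFO, args) < 0)
		return 1;
	for (__u64 i = 0; i < args->total_spaces; i++)
		printf("flags 0x%llx total %llu used %llu\n",
		       (unsigned long long)args->spaces[i].flags,
		       (unsigned long long)args->spaces[i].total_bytes,
		       (unsigned long long)args->spaces[i].used_bytes);
	free(args);
	return 0;
}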
*/ + u64 transid = 0; + + if (argp) + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + + return btrfs_wait_for_commit(fs_info, transid); +} + +static long btrfs_ioctl_scrub(struct file *file, void __user *arg) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + return -EINVAL; + } + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) + goto out; + } + + ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); + + /* + * Copy scrub args to user space even if btrfs_scrub_dev() returned an + * error. This is important as it allows user space to know how much + * progress scrub has done. For example, if scrub is canceled we get + * -ECANCELED from btrfs_scrub_dev() and return that error back to user + * space. Later user space can inspect the progress from the structure + * btrfs_ioctl_scrub_args and resume scrub from where it left off + * previously (btrfs-progs does this). + * If we fail to copy the btrfs_ioctl_scrub_args structure to user space + * then return -EFAULT to signal the structure was not copied or it may + * be corrupt and unreliable due to a partial copy. + */ + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) + mnt_drop_write_file(file); +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_scrub_cancel(fs_info); +} + +static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); + + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_get_dev_stats *sa; + int ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { + kfree(sa); + return -EPERM; + } + + ret = btrfs_get_dev_stats(fs_info, sa); + + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_dev_replace_args *p; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); + return -EINVAL; + } + + p = memdup_user(arg, sizeof(*p)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (p->cmd) { + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (sb_rdonly(fs_info->sb)) { + ret = -EROFS; + goto out; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } 
else { + ret = btrfs_dev_replace_by_ioctl(fs_info, p); + btrfs_exclop_finish(fs_info); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: + btrfs_dev_replace_status(fs_info, p); + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: + p->result = btrfs_dev_replace_cancel(fs_info); + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + + if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p))) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + u64 rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa = NULL; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_DAC_READ_SEARCH)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; + } + + btrfs_free_path(path); + path = NULL; + ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, + ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, + void __user *arg, int version) +{ + int ret = 0; + int size; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + bool ignore_offset; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) + return PTR_ERR(loi); + + if (version == 1) { + ignore_offset = false; + size = min_t(u32, loi->size, SZ_64K); + } else { + /* All reserved bits must be 0 for now */ + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { + ret = -EINVAL; + goto out_loi; + } + /* Only accept flags we have defined so far */ + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { + ret = -EINVAL; + goto out_loi; + } + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + size = min_t(u32, loi->size, SZ_16M); + } + + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + goto out_loi; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = iterate_inodes_from_logical(loi->logical, fs_info, path, + inodes, ignore_offset); + btrfs_free_path(path); + if (ret == -EINVAL) + ret = -ENOENT; + if (ret < 0) + goto out; + + ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, + size); + if (ret) + ret = -EFAULT; + +out: + kvfree(inodes); +out_loi: + kfree(loi); + + return ret; +} + +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if 
(atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); +} + +/* + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXCLOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success, in which case fs_info::balance_mutex is acquired and + * exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked. Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + +static long btrfs_ioctl_balance(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + bool need_unlock = true; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + bargs = NULL; + goto out; + } + + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) + goto out; + + lockdep_assert_held(&fs_info->balance_mutex); + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_unlock; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); + + goto do_balance; + } + + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_unlock; + } + + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_unlock; + } + + bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); + if (!bctl) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys,
sizeof(bctl->sys)); + + bctl->flags = bargs->flags; +do_balance: + /* + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. + */ + need_unlock = false; + + ret = btrfs_balance(fs_info, bctl, bargs); + bctl = NULL; + + if (ret == 0 || ret == -ECANCELED) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + + kfree(bctl); +out_unlock: + mutex_unlock(&fs_info->balance_mutex); + if (need_unlock) + btrfs_exclop_finish(fs_info); +out: + mnt_drop_write_file(file); + kfree(bargs); + return ret; +} + +static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(fs_info); + } + + return -EINVAL; +} + +static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + btrfs_update_ioctl_balance_args(fs_info, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_ioctl_quota_ctl_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + down_write(&fs_info->subvol_sem); + + switch (sa->cmd) { + case BTRFS_QUOTA_CTL_ENABLE: + ret = btrfs_quota_enable(fs_info); + break; + case BTRFS_QUOTA_CTL_DISABLE: + ret = btrfs_quota_disable(fs_info); + break; + default: + ret = -EINVAL; + break; + } + + kfree(sa); + up_write(&fs_info->subvol_sem); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + if (sa->assign) { + ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); + } else { + ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); + } + + /* update qgroup status and info */ + mutex_lock(&fs_info->qgroup_ioctl_lock); + err = btrfs_run_qgroups(trans); + mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (err < 0) + btrfs_handle_fs_error(fs_info, err, + "failed to update qgroup status and info"); + err = 
btrfs_end_transaction(trans); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_qgroup_create_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + if (sa->create) { + ret = btrfs_create_qgroup(trans, sa->qgroupid); + } else { + ret = btrfs_remove_qgroup(trans, sa->qgroupid); + } + + err = btrfs_end_transaction(trans); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_qgroup_limit_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + u64 qgroupid; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + qgroupid = sa->qgroupid; + if (!qgroupid) { + /* take the current subvol as qgroup */ + qgroupid = root->root_key.objectid; + } + + ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); + + err = btrfs_end_transaction(trans); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + qsa = memdup_user(arg, sizeof(*qsa)); + if (IS_ERR(qsa)) { + ret = PTR_ERR(qsa); + goto drop_write; + } + + if (qsa->flags) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_qgroup_rescan(fs_info); + +out: + kfree(qsa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_ioctl_quota_rescan_args qsa = {0}; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + qsa.flags = 1; + qsa.progress = fs_info->qgroup_rescan_progress.objectid; + } + + if (copy_to_user(arg, &qsa, sizeof(qsa))) + return -EFAULT; + + return 0; +} + +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(fs_info, true); +} + +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct mnt_idmap *idmap, + struct btrfs_ioctl_received_subvol_args *sa) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + 
struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root_item *root_item = &root->root_item; + struct btrfs_trans_handle *trans; + struct timespec64 ct = current_time(inode); + int ret = 0; + int received_uuid_changed; + + if (!inode_owner_or_capable(idmap, inode)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret < 0) + return ret; + + down_write(&fs_info->subvol_sem); + + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out; + } + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + + /* + * 1 - root item + * 2 - uuid items (received uuid + subvol uuid) + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + sa->rtransid = trans->transid; + sa->rtime.sec = ct.tv_sec; + sa->rtime.nsec = ct.tv_nsec; + + received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, + BTRFS_UUID_SIZE); + if (received_uuid_changed && + !btrfs_is_empty_uuid(root_item->received_uuid)) { + ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } + memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); + btrfs_set_root_stransid(root_item, sa->stransid); + btrfs_set_root_rtransid(root_item, sa->rtransid); + btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); + btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); + btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); + btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); + + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans); + goto out; + } + if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { + ret = btrfs_uuid_tree_add(trans, sa->uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret < 0 && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } + ret = btrfs_commit_transaction(trans); +out: + up_write(&fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) + return PTR_ERR(args32); + + args64 = kmalloc(sizeof(*args64), GFP_KERNEL); + if (!args64) { + ret = -ENOMEM; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, 
args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa); + + if (ret) + goto out; + + ret = copy_to_user(arg, sa, sizeof(*sa)); + if (ret) + ret = -EFAULT; + +out: + kfree(sa); + return ret; +} + +static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + size_t len; + int ret; + char label[BTRFS_LABEL_SIZE]; + + spin_lock(&fs_info->super_lock); + memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE); + spin_unlock(&fs_info->super_lock); + + len = strnlen(label, BTRFS_LABEL_SIZE); + + if (len == BTRFS_LABEL_SIZE) { + btrfs_warn(fs_info, + "label is too long, return the first %zu bytes", + --len); + } + + ret = copy_to_user(arg, label, len); + + return ret ? -EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_super_block *super_block = fs_info->super_copy; + struct btrfs_trans_handle *trans; + char label[BTRFS_LABEL_SIZE]; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, arg, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { + btrfs_err(fs_info, + "unable to set label with more than %d bytes", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + spin_lock(&fs_info->super_lock); + strcpy(super_block->label, label); + spin_unlock(&fs_info->super_lock); + ret = btrfs_commit_transaction(trans); + +out_unlock: + mnt_drop_write_file(file); + return ret; +} + +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +int btrfs_ioctl_get_supported_features(void __user *arg) +{ + static const struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, + void __user *arg) +{ + struct btrfs_super_block *super_block = fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + const char *type = btrfs_feature_set_name(set); + char *names; + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = 
set_mask & ~supported_flags; + if (unsupported) { + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(fs_info, change_mask, flags, mask_base) \ +check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_super_block *super_block = fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(fs_info, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(fs_info, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(fs_info, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } + + spin_lock(&fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&fs_info->super_lock); + + ret = btrfs_commit_transaction(trans); +out_drop_write: + mnt_drop_write_file(file); + + return ret; +} + +static int 
_btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) +{ + struct btrfs_ioctl_send_args *arg; + int ret; + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_send_args_32 args32 = { 0 }; + + ret = copy_from_user(&args32, argp, sizeof(args32)); + if (ret) + return -EFAULT; + arg = kzalloc(sizeof(*arg), GFP_KERNEL); + if (!arg) + return -ENOMEM; + arg->send_fd = args32.send_fd; + arg->clone_sources_count = args32.clone_sources_count; + arg->clone_sources = compat_ptr(args32.clone_sources); + arg->parent_root = args32.parent_root; + arg->flags = args32.flags; + arg->version = args32.version; + memcpy(arg->reserved, args32.reserved, + sizeof(args32.reserved)); +#else + return -ENOTTY; +#endif + } else { + arg = memdup_user(argp, sizeof(*arg)); + if (IS_ERR(arg)) + return PTR_ERR(arg); + } + ret = btrfs_ioctl_send(inode, arg); + kfree(arg); + return ret; +} + +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, + bool compat) +{ + struct btrfs_ioctl_encoded_io_args args = { 0 }; + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + if (args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_iov; + } + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + ret = btrfs_encoded_read(&kiocb, &iter, &args); + if (ret >= 0) { + fsnotify_access(file); + if (copy_to_user(argp + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) + ret = -EFAULT; + } + +out_iov: + kfree(iov); +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, argp, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; + args.len = args32.len; + args.unencoded_len = 
args32.unencoded_len; + args.unencoded_offset = args32.unencoded_offset; + args.compression = args32.compression; + args.encryption = args32.encryption; + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); +#else + return -ENOTTY; +#endif + } else { + if (copy_from_user(&args, argp, sizeof(args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (args.flags != 0) + goto out_acct; + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) + goto out_acct; + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (args.unencoded_offset > args.unencoded_len) + goto out_acct; + if (args.len > args.unencoded_len - args.unencoded_offset) + goto out_acct; + + ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + file_start_write(file); + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_end_write; + } + pos = args.offset; + ret = rw_verify_area(WRITE, file, &pos, args.len); + if (ret < 0) + goto out_end_write; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0); + if (ret) + goto out_end_write; + kiocb.ki_pos = pos; + + ret = btrfs_do_write_iter(&kiocb, &iter, &args); + if (ret > 0) + fsnotify_modify(file); + +out_end_write: + file_end_write(file); + kfree(iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(inode, argp); + case FS_IOC_GETFSLABEL: + return btrfs_ioctl_get_fslabel(fs_info, argp); + case FS_IOC_SETFSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(fs_info, argp); + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SNAP_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SUBVOL_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp, false); + case BTRFS_IOC_SNAP_DESTROY_V2: + return btrfs_ioctl_snap_destroy(file, argp, true); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(inode, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); + case BTRFS_IOC_DEFAULT_SUBVOL: + return btrfs_ioctl_default_subvol(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file, NULL); + case BTRFS_IOC_DEFRAG_RANGE: + return btrfs_ioctl_defrag(file, argp); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(file, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(fs_info, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_RM_DEV_V2: + return btrfs_ioctl_rm_dev_v2(file, argp); + case BTRFS_IOC_FS_INFO: + return btrfs_ioctl_fs_info(fs_info, argp); + case BTRFS_IOC_DEV_INFO: + return btrfs_ioctl_dev_info(fs_info, argp); + case BTRFS_IOC_TREE_SEARCH: + return 
btrfs_ioctl_tree_search(inode, argp); + case BTRFS_IOC_TREE_SEARCH_V2: + return btrfs_ioctl_tree_search_v2(inode, argp); + case BTRFS_IOC_INO_LOOKUP: + return btrfs_ioctl_ino_lookup(root, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); + case BTRFS_IOC_LOGICAL_INO_V2: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); + case BTRFS_IOC_SPACE_INFO: + return btrfs_ioctl_space_info(fs_info, argp); + case BTRFS_IOC_SYNC: { + int ret; + + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); + if (ret) + return ret; + ret = btrfs_sync_fs(inode->i_sb, 1); + /* + * The transaction thread may want to do more work, + * namely it pokes the cleaner kthread that will start + * processing uncleaned subvols. + */ + wake_up_process(fs_info->transaction_kthread); + return ret; + } + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(root, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(fs_info, argp); + case BTRFS_IOC_SCRUB: + return btrfs_ioctl_scrub(file, argp); + case BTRFS_IOC_SCRUB_CANCEL: + return btrfs_ioctl_scrub_cancel(fs_info); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(fs_info, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(file, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(fs_info, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(fs_info, argp); + case BTRFS_IOC_SET_RECEIVED_SUBVOL: + return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif + case BTRFS_IOC_SEND: + return _btrfs_ioctl_send(inode, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_SEND_32: + return _btrfs_ioctl_send(inode, argp, true); +#endif + case BTRFS_IOC_GET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(fs_info, argp); + case BTRFS_IOC_QUOTA_CTL: + return btrfs_ioctl_quota_ctl(file, argp); + case BTRFS_IOC_QGROUP_ASSIGN: + return btrfs_ioctl_qgroup_assign(file, argp); + case BTRFS_IOC_QGROUP_CREATE: + return btrfs_ioctl_qgroup_create(file, argp); + case BTRFS_IOC_QGROUP_LIMIT: + return btrfs_ioctl_qgroup_limit(file, argp); + case BTRFS_IOC_QUOTA_RESCAN: + return btrfs_ioctl_quota_rescan(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_STATUS: + return btrfs_ioctl_quota_rescan_status(fs_info, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + case BTRFS_IOC_DEV_REPLACE: + return btrfs_ioctl_dev_replace(fs_info, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(fs_info, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); + case BTRFS_IOC_GET_SUBVOL_INFO: + return btrfs_ioctl_get_subvol_info(inode, argp); + case BTRFS_IOC_GET_SUBVOL_ROOTREF: + return btrfs_ioctl_get_subvol_rootref(root, argp); + case BTRFS_IOC_INO_LOOKUP_USER: + return btrfs_ioctl_ino_lookup_user(file, argp); + case FS_IOC_ENABLE_VERITY: + return fsverity_ioctl_enable(file, (const void __user *)argp); + case FS_IOC_MEASURE_VERITY: + return fsverity_ioctl_measure(file, argp); + case BTRFS_IOC_ENCODED_READ: + return btrfs_ioctl_encoded_read(file, argp, false); + case BTRFS_IOC_ENCODED_WRITE: + return btrfs_ioctl_encoded_write(file, argp, false); +#if defined(CONFIG_64BIT) && 
defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: + return btrfs_ioctl_encoded_read(file, argp, true); + case BTRFS_IOC_ENCODED_WRITE_32: + return btrfs_ioctl_encoded_write(file, argp, true); +#endif + } + + return -ENOTTY; +} + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* + * These all access 32-bit values anyway so no further + * handling is necessary. + */ + switch (cmd) { + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 0000000000..d51b9a2f2f --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_IOCTL_H +#define BTRFS_IOCTL_H + +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa); +int btrfs_ioctl_get_supported_features(void __user *arg); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +int __pure btrfs_is_empty_uuid(u8 *uuid); +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 0000000000..7979449a58 --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" +#include "accessors.h" + +/* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->root_key.objectid. This ensures that all special purpose + * roots have separate keysets. + * + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. + * + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. + * + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. 
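+ *
+ * For example (illustrative only): an extent buffer at level 1 of the
+ * extent tree gets the lockdep class named "btrfs-extent-01" from the
+ * keyset table below, while a level 0 leaf of a root not listed in the
+ * table falls back to the default "btrfs-tree-00" entry.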
+ */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if BTRFS_MAX_LEVEL != 8 +#error +#endif + +#define DEFINE_LEVEL(stem, level) \ + .names[level] = "btrfs-" stem "-0" #level, + +#define DEFINE_NAME(stem) \ + DEFINE_LEVEL(stem, 0) \ + DEFINE_LEVEL(stem, 1) \ + DEFINE_LEVEL(stem, 2) \ + DEFINE_LEVEL(stem, 3) \ + DEFINE_LEVEL(stem, 4) \ + DEFINE_LEVEL(stem, 5) \ + DEFINE_LEVEL(stem, 6) \ + DEFINE_LEVEL(stem, 7) + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + /* Longest entry: btrfs-block-group-00 */ + char names[BTRFS_MAX_LEVEL][24]; + struct lock_class_key keys[BTRFS_MAX_LEVEL]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, + { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, + { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, + { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, + { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, + { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, + { .id = 0, DEFINE_NAME("tree") }, +}; + +#undef DEFINE_LEVEL +#undef DEFINE_NAME + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* Find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]); +} + +void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) +{ + if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + btrfs_set_buffer_lockdep_class(root->root_key.objectid, + eb, btrfs_header_level(eb)); +} + +#endif + +/* + * Extent buffer locking + * ===================== + * + * We use a rw_semaphore for tree locking, and the semantics are exactly the + * same: + * + * - reader/writer exclusion + * - writer/writer exclusion + * - reader/reader sharing + * - try-lock semantics for readers and writers + * + * The rwsem implementation does opportunistic spinning which reduces number of + * times the locking task needs to sleep. + */ + +/* + * __btrfs_tree_read_lock - lock extent buffer for read + * @eb: the eb to be locked + * @nest: the nesting level to be used for lockdep + * + * This takes the read lock on the extent buffer, using the specified nesting + * level for lockdep purposes. + */ +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +{ + u64 start_ns = 0; + + if (trace_btrfs_tree_read_lock_enabled()) + start_ns = ktime_get_ns(); + + down_read_nested(&eb->lock, nest); + trace_btrfs_tree_read_lock(eb, start_ns); +} + +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); +} + +/* + * Try-lock for read. + * + * Return 1 if the rwlock has been taken, 0 otherwise + */ +int btrfs_try_tree_read_lock(struct extent_buffer *eb) +{ + if (down_read_trylock(&eb->lock)) { + trace_btrfs_try_tree_read_lock(eb); + return 1; + } + return 0; +} + +/* + * Try-lock for write. 
+ * + * Return 1 if the rwlock has been taken, 0 otherwise + */ +int btrfs_try_tree_write_lock(struct extent_buffer *eb) +{ + if (down_write_trylock(&eb->lock)) { + eb->lock_owner = current->pid; + trace_btrfs_try_tree_write_lock(eb); + return 1; + } + return 0; +} + +/* + * Release read lock. + */ +void btrfs_tree_read_unlock(struct extent_buffer *eb) +{ + trace_btrfs_tree_read_unlock(eb); + up_read(&eb->lock); +} + +/* + * __btrfs_tree_lock - lock eb for write + * @eb: the eb to lock + * @nest: the nesting to use for the lock + * + * Returns with the eb->lock write locked. + */ +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) + __acquires(&eb->lock) +{ + u64 start_ns = 0; + + if (trace_btrfs_tree_lock_enabled()) + start_ns = ktime_get_ns(); + + down_write_nested(&eb->lock, nest); + eb->lock_owner = current->pid; + trace_btrfs_tree_lock(eb, start_ns); +} + +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + +/* + * Release the write lock. + */ +void btrfs_tree_unlock(struct extent_buffer *eb) +{ + trace_btrfs_tree_unlock(eb); + eb->lock_owner = 0; + up_write(&eb->lock); +} + +/* + * This releases any locks held in the path starting at level and going all the + * way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few corner + * cases, such as COW of the block at slot zero in the node. This ignores + * those rules, and it should only be called when there are no more updates to + * be done higher up in the tree. + */ +void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } +} + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with write lock held + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + + btrfs_maybe_reset_lockdep_class(root, eb); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with read lock held + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + + btrfs_maybe_reset_lockdep_class(root, eb); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * Loop around taking references on and locking the root node of the tree in + * nowait mode until we end up with a lock on the root node or returning to + * avoid blocking. + * + * Return: root extent buffer with read lock held or -EAGAIN. 
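+ *
+ * A minimal, purely illustrative caller sketch: instead of blocking, the
+ * function returns ERR_PTR(-EAGAIN), so a nowait path would do
+ *
+ *   eb = btrfs_try_read_lock_root_node(root);
+ *   if (IS_ERR(eb))
+ *           return PTR_ERR(eb);
+ *
+ * and retry later with the blocking btrfs_read_lock_root_node().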
+ */ +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + if (!btrfs_try_tree_read_lock(eb)) { + free_extent_buffer(eb); + return ERR_PTR(-EAGAIN); + } + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * DREW locks + * ========== + * + * DREW stands for double-reader-writer-exclusion lock. It's used in situation + * where you want to provide A-B exclusion but not AA or BB. + * + * Currently implementation gives more priority to reader. If a reader and a + * writer both race to acquire their respective sides of the lock the writer + * would yield its lock as soon as it detects a concurrent reader. Additionally + * if there are pending readers no new writers would be allowed to come in and + * acquire the lock. + */ + +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +{ + atomic_set(&lock->readers, 0); + atomic_set(&lock->writers, 0); + init_waitqueue_head(&lock->pending_readers); + init_waitqueue_head(&lock->pending_writers); +} + +/* Return true if acquisition is successful, false otherwise */ +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) +{ + if (atomic_read(&lock->readers)) + return false; + + atomic_inc(&lock->writers); + + /* Ensure writers count is updated before we check for pending readers */ + smp_mb__after_atomic(); + if (atomic_read(&lock->readers)) { + btrfs_drew_write_unlock(lock); + return false; + } + + return true; +} + +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) +{ + while (true) { + if (btrfs_drew_try_write_lock(lock)) + return; + wait_event(lock->pending_writers, !atomic_read(&lock->readers)); + } +} + +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) +{ + atomic_dec(&lock->writers); + cond_wake_up(&lock->pending_readers); +} + +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) +{ + atomic_inc(&lock->readers); + + /* + * Ensure the pending reader count is perceieved BEFORE this reader + * goes to sleep in case of active writers. This guarantees new writers + * won't be allowed and that the current reader will be woken up when + * the last active writer finishes its jobs. + */ + smp_mb__after_atomic(); + + wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); +} + +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +{ + /* + * atomic_dec_and_test implies a full barrier, so woken up writers + * are guaranteed to see the decrement + */ + if (atomic_dec_and_test(&lock->readers)) + wake_up(&lock->pending_writers); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 0000000000..7d6ee1e609 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + */ + +#ifndef BTRFS_LOCKING_H +#define BTRFS_LOCKING_H + +#include +#include +#include +#include "extent_io.h" + +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 + +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. + */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. 
Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. + */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + + /* + * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future. + */ + BTRFS_NESTING_MAX, +}; + +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_PREP, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. 
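+ *
+ * The @i argument is one of the enum btrfs_lockdep_trans_states values
+ * defined above. Illustratively, a thread waiting for a transaction state
+ * change would annotate the wait with
+ * btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED),
+ * while the thread performing the change wraps it in a matching
+ * btrfs_trans_state_lockdep_acquire()/btrfs_trans_state_lockdep_release()
+ * pair (using fs_info as the owner is an assumption of this example).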
+ */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + +struct btrfs_path; + +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); +void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_unlock(struct extent_buffer *eb); + +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_write(&eb->lock); +} +#else +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +#endif + +void btrfs_unlock_up_safe(struct btrfs_path *path, int level); + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ + if (rw == BTRFS_WRITE_LOCK) + btrfs_tree_unlock(eb); + else if (rw == BTRFS_READ_LOCK) + btrfs_tree_read_unlock(eb); + else + BUG(); +} + +struct btrfs_drew_lock { + atomic_t readers; + atomic_t writers; + wait_queue_head_t pending_writers; + wait_queue_head_t pending_readers; +}; + +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level); +void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb); +#else +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) +{ +} +static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, + struct extent_buffer *eb) +{ +} +#endif + +#endif diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c new file mode 100644 index 0000000000..0fe0ae54ac --- /dev/null +++ b/fs/btrfs/lru_cache.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "lru_cache.h" +#include 
"messages.h" + +/* + * Initialize a cache object. + * + * @cache: The cache. + * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size, it's the user's responsability to + * trim the cache in that case. + */ +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) +{ + INIT_LIST_HEAD(&cache->lru_list); + mt_init(&cache->entries); + cache->size = 0; + cache->max_size = max_size; +} + +static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, + u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + list_for_each_entry(entry, head, list) { + if (entry->key == key && entry->gen == gen) + return entry; + } + + return NULL; +} + +/* + * Lookup for an entry in the cache. + * + * @cache: The cache. + * @key: The key of the entry we are looking for. + * @gen: Generation associated to the key. + * + * Returns the entry associated with the key or NULL if none found. + */ +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen) +{ + struct list_head *head; + struct btrfs_lru_cache_entry *entry; + + head = mtree_load(&cache->entries, key); + if (!head) + return NULL; + + entry = match_entry(head, key, gen); + if (entry) + list_move_tail(&entry->lru_list, &cache->lru_list); + + return entry; +} + +/* + * Remove an entry from the cache. + * + * @cache: The cache to remove from. + * @entry: The entry to remove from the cache. + * + * Note: this also frees the memory used by the entry. + */ +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry) +{ + struct list_head *prev = entry->list.prev; + + ASSERT(cache->size > 0); + ASSERT(!mtree_empty(&cache->entries)); + + list_del(&entry->list); + list_del(&entry->lru_list); + + if (list_empty(prev)) { + struct list_head *head; + + /* + * If previous element in the list entry->list is now empty, it + * means it's a head entry not pointing to any cached entries, + * so remove it from the maple tree and free it. + */ + head = mtree_erase(&cache->entries, entry->key); + ASSERT(head == prev); + kfree(head); + } + + kfree(entry); + cache->size--; +} + +/* + * Store an entry in the cache. + * + * @cache: The cache. + * @entry: The entry to store. + * + * Returns 0 on success and < 0 on error. + */ +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp) +{ + const u64 key = new_entry->key; + struct list_head *head; + int ret; + + head = kmalloc(sizeof(*head), gfp); + if (!head) + return -ENOMEM; + + ret = mtree_insert(&cache->entries, key, head, gfp); + if (ret == 0) { + INIT_LIST_HEAD(head); + list_add_tail(&new_entry->list, head); + } else if (ret == -EEXIST) { + kfree(head); + head = mtree_load(&cache->entries, key); + ASSERT(head != NULL); + if (match_entry(head, key, new_entry->gen) != NULL) + return -EEXIST; + list_add_tail(&new_entry->list, head); + } else if (ret < 0) { + kfree(head); + return ret; + } + + if (cache->max_size > 0 && cache->size == cache->max_size) { + struct btrfs_lru_cache_entry *lru_entry; + + lru_entry = list_first_entry(&cache->lru_list, + struct btrfs_lru_cache_entry, + lru_list); + btrfs_lru_cache_remove(cache, lru_entry); + } + + list_add_tail(&new_entry->lru_list, &cache->lru_list); + cache->size++; + + return 0; +} + +/* + * Empty a cache. + * + * @cache: The cache to empty. + * + * Removes all entries from the cache. 
+ */ +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) +{ + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) + btrfs_lru_cache_remove(cache, entry); + + ASSERT(cache->size == 0); + ASSERT(mtree_empty(&cache->entries)); +} diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h new file mode 100644 index 0000000000..00328c856b --- /dev/null +++ b/fs/btrfs/lru_cache.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_LRU_CACHE_H +#define BTRFS_LRU_CACHE_H + +#include +#include + +/* + * A cache entry. This is meant to be embedded in a structure of a user of + * this module. Similar to how struct list_head and struct rb_node are used. + * + * Note: it should be embedded as the first element in a struct (offset 0), and + * this module assumes it was allocated with kmalloc(), so it calls kfree() when + * it needs to free an entry. + */ +struct btrfs_lru_cache_entry { + struct list_head lru_list; + u64 key; + /* + * Optional generation associated to a key. Use 0 if not needed/used. + * Entries with the same key and different generations are stored in a + * linked list, so use this only for cases where there's a small number + * of different generations. + */ + u64 gen; + /* + * The maple tree uses unsigned long type for the keys, which is 32 bits + * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to + * use something like inode numbers as keys, which are always a u64, we + * have to deal with this in a special way - we store the key in the + * entry itself, as a u64, and the values inserted into the maple tree + * are linked lists of entries - so in case we are on a 64 bits system, + * that list always has a single entry, while on 32 bits systems it + * may have more than one, with each entry having the same value for + * their lower 32 bits of the u64 key. + */ + struct list_head list; +}; + +struct btrfs_lru_cache { + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + unsigned int size; + /* Maximum number of entries the cache can have. */ + unsigned int max_size; +}; + +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + +static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) +{ + return cache->size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen); +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp); +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry); +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); + +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 0000000000..d3fcfc628a --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Oracle. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "compression.h" +#include "ctree.h" +#include "super.h" +#include "btrfs_inode.h" + +#define LZO_LEN 4 + +/* + * Btrfs LZO compression format + * + * Regular and inlined LZO compressed data extents consist of: + * + * 1. Header + * Fixed size. LZO_LEN (4) bytes long, LE32. + * Records the total size (including the header) of compressed data. + * + * 2. Segment(s) + * Variable size. Each segment includes one segment header, followed by data + * payload. + * One regular LZO compressed extent can have one or more segments. + * For inlined LZO compressed extent, only one segment is allowed. + * One segment represents at most one sector of uncompressed data. + * + * 2.1 Segment header + * Fixed size. LZO_LEN (4) bytes long, LE32. + * Records the total size of the segment (not including the header). + * Segment header never crosses sector boundary, thus it's possible to + * have at most 3 padding zeros at the end of the sector. + * + * 2.2 Data Payload + * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize) + * which is 4419 for a 4KiB sectorsize. + * + * Example with 4K sectorsize: + * Page 1: + * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 + * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | + * ... + * 0x0ff0 | SegHdr N | Data payload N ... |00| + * ^^ padding zeros + * Page 2: + * 0x1000 | SegHdr N+1| Data payload N+1 ... | + */ + +#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) +#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) + +struct workspace { + void *mem; + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ + struct list_head list; +}; + +static struct workspace_manager wsm; + +void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->buf); + kvfree(workspace->cbuf); + kvfree(workspace->mem); + kfree(workspace); +} + +struct list_head *lzo_alloc_workspace(unsigned int level) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(const char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +/* + * Will do: + * + * - Write a segment header into the destination + * - Copy the compressed buffer into the destination + * - Make sure we have enough space in the last sector to fit a segment header + * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * + * Will allocate new pages when needed. 
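+ *
+ * For example, with a 4K sector: if only 3 bytes remain in the current
+ * sector, the next LZO_LEN (4) byte segment header would cross the sector
+ * boundary, so those 3 bytes are zero-padded and the header is written at
+ * the start of the next sector.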
+ */ +static int copy_compressed_data_to_page(char *compressed_data, + size_t compressed_size, + struct page **out_pages, + unsigned long max_nr_page, + u32 *cur_out, + const u32 sectorsize) +{ + u32 sector_bytes_left; + u32 orig_out; + struct page *cur_page; + char *kaddr; + + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + + /* + * We never allow a segment header crossing sector boundary, previous + * run should ensure we have enough space left inside the sector. + */ + ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + + kaddr = kmap_local_page(cur_page); + write_compress_length(kaddr + offset_in_page(*cur_out), + compressed_size); + *cur_out += LZO_LEN; + + orig_out = *cur_out; + + /* Copy compressed data */ + while (*cur_out - orig_out < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, + orig_out + compressed_size - *cur_out); + + kunmap_local(kaddr); + + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + kaddr = kmap_local_page(cur_page); + + memcpy(kaddr + offset_in_page(*cur_out), + compressed_data + *cur_out - orig_out, copy_len); + + *cur_out += copy_len; + } + + /* + * Check if we can fit the next segment header into the remaining space + * of the sector. + */ + sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; + if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) + goto out; + + /* The remaining size is not enough, pad it with zeros */ + memset(kaddr + offset_in_page(*cur_out), 0, + sector_bytes_left); + *cur_out += sector_bytes_left; + +out: + kunmap_local(kaddr); + return 0; +} + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + struct page *page_in = NULL; + char *sizes_ptr; + const unsigned long max_nr_page = *out_pages; + int ret = 0; + /* Points to the file offset of input data */ + u64 cur_in = start; + /* Points to the current output byte */ + u32 cur_out = 0; + u32 len = *total_out; + + ASSERT(max_nr_page > 0); + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + /* + * Skip the header for now, we will later come back and write the total + * compressed size + */ + cur_out += LZO_LEN; + while (cur_in < start + len) { + char *data_in; + const u32 sectorsize_mask = sectorsize - 1; + u32 sector_off = (cur_in - start) & sectorsize_mask; + u32 in_len; + size_t out_len; + + /* Get the input page first */ + if (!page_in) { + page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); + ASSERT(page_in); + } + + /* Compress at most one sector of data each time */ + in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); + ASSERT(in_len); + data_in = kmap_local_page(page_in); + ret = lzo1x_1_compress(data_in + + offset_in_page(cur_in), in_len, + workspace->cbuf, &out_len, + workspace->mem); + kunmap_local(data_in); + if (ret < 0) { + pr_debug("BTRFS: lzo in loop 
returned %d\n", ret); + ret = -EIO; + goto out; + } + + ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + pages, max_nr_page, + &cur_out, sectorsize); + if (ret < 0) + goto out; + + cur_in += in_len; + + /* + * Check if we're making it bigger after two sectors. And if + * it is so, give up. + */ + if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { + ret = -E2BIG; + goto out; + } + + /* Check if we have reached page boundary */ + if (PAGE_ALIGNED(cur_in)) { + put_page(page_in); + page_in = NULL; + } + } + + /* Store the size of all chunks of compressed data */ + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, cur_out); + kunmap_local(sizes_ptr); + + ret = 0; + *total_out = cur_out; + *total_in = cur_in - start; +out: + if (page_in) + put_page(page_in); + *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); + return ret; +} + +/* + * Copy the compressed segment payload into @dest. + * + * For the payload there will be no padding, just need to do page switching. + */ +static void copy_compressed_segment(struct compressed_bio *cb, + char *dest, u32 len, u32 *cur_in) +{ + u32 orig_in = *cur_in; + + while (*cur_in < orig_in + len) { + struct page *cur_page; + u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), + orig_in + len - *cur_in); + + ASSERT(copy_len); + cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + + memcpy_from_page(dest + *cur_in - orig_in, cur_page, + offset_in_page(*cur_in), copy_len); + + *cur_in += copy_len; + } +} + +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + char *kaddr; + int ret; + /* Compressed data length, can be unaligned */ + u32 len_in; + /* Offset inside the compressed data */ + u32 cur_in = 0; + /* Bytes decompressed so far */ + u32 cur_out = 0; + + kaddr = kmap_local_page(cb->compressed_pages[0]); + len_in = read_compress_length(kaddr); + kunmap_local(kaddr); + cur_in += LZO_LEN; + + /* + * LZO header length check + * + * The total length should not exceed the maximum extent length, + * and all sectors should be used. + * If this happens, it means the compressed extent is corrupted. + */ + if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || + round_up(len_in, sectorsize) < cb->compressed_len) { + btrfs_err(fs_info, + "invalid lzo header, lzo len %u compressed len %u", + len_in, cb->compressed_len); + return -EUCLEAN; + } + + /* Go through each lzo segment */ + while (cur_in < len_in) { + struct page *cur_page; + /* Length of the compressed segment */ + u32 seg_len; + u32 sector_bytes_left; + size_t out_len = lzo1x_worst_compress(sectorsize); + + /* + * We should always have enough space for one segment header + * inside current sector. 
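+ * (That is, cur_in and cur_in + LZO_LEN - 1 must fall into the same
+ * sector, which is exactly what the assertion below checks.)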
+ */ + ASSERT(cur_in / sectorsize == + (cur_in + LZO_LEN - 1) / sectorsize); + cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; + ASSERT(cur_page); + kaddr = kmap_local_page(cur_page); + seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + kunmap_local(kaddr); + cur_in += LZO_LEN; + + if (seg_len > WORKSPACE_CBUF_LENGTH) { + /* + * seg_len shouldn't be larger than we have allocated + * for workspace->cbuf + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); + return -EIO; + } + + /* Copy the compressed segment payload into workspace */ + copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + + /* Decompress the data */ + ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, + workspace->buf, &out_len); + if (ret != LZO_E_OK) { + btrfs_err(fs_info, "failed to decompress"); + return -EIO; + } + + /* Copy the data into inode pages */ + ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out); + cur_out += out_len; + + /* All data read, exit */ + if (ret == 0) + return 0; + ret = 0; + + /* Check if the sector has enough space for a segment header */ + sector_bytes_left = sectorsize - (cur_in % sectorsize); + if (sector_bytes_left >= LZO_LEN) + continue; + + /* Skip the padding zeros */ + cur_in += sector_bytes_left; + } + + return 0; +} + +int lzo_decompress(struct list_head *ws, const u8 *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t max_segment_len = WORKSPACE_BUF_LENGTH; + int ret = 0; + char *kaddr; + unsigned long bytes; + + if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + return -EUCLEAN; + + in_len = read_compress_length(data_in); + if (in_len != srclen) + return -EUCLEAN; + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + if (in_len != srclen - LZO_LEN * 2) { + ret = -EUCLEAN; + goto out; + } + data_in += LZO_LEN; + + out_len = PAGE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + pr_warn("BTRFS: decompress failed!\n"); + ret = -EIO; + goto out; + } + + if (out_len < start_byte) { + ret = -EIO; + goto out; + } + + /* + * the caller is already checking against PAGE_SIZE, but lets + * move this check closer to the memcpy/memset + */ + destlen = min_t(unsigned long, destlen, PAGE_SIZE); + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_local_page(dest_page); + memcpy(kaddr, workspace->buf + start_byte, bytes); + + /* + * btrfs_getblock is doing a zero on the tail of the page too, + * but this will cover anything missing from the decompressed + * data. 
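+ * (For example, if the caller asked for a full page but the segment
+ * decompressed to less than that, the remainder of @dest_page is zero
+ * filled below.)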
+ */ + if (bytes < destlen) + memset(kaddr+bytes, 0, destlen-bytes); + kunmap_local(kaddr); +out: + return ret; +} + +const struct btrfs_compress_op btrfs_lzo_compress = { + .workspace_manager = &wsm, + .max_level = 1, + .default_level = 1, +}; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c new file mode 100644 index 0000000000..7695decc72 --- /dev/null +++ b/fs/btrfs/messages.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fs.h" +#include "messages.h" +#include "discard.h" +#include "transaction.h" +#include "space-info.h" +#include "super.h" + +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + if (BTRFS_FS_ERROR(info)) { + *curr++ = 'E'; + states_printed = true; + } + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. + */ +const char * __attribute_const__ btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ + errstr = "IO failure"; + break; + case -ENOMEM: /* -12*/ + errstr = "Out of memory"; + break; + case -EEXIST: /* -17 */ + errstr = "Object already exists"; + break; + case -ENOSPC: /* -28 */ + errstr = "No space left"; + break; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; + break; + } + + return errstr; +} + +/* + * __btrfs_handle_fs_error decodes expected errors from the caller and + * invokes the appropriate error response. 
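+ *
+ * Callers normally go through the btrfs_handle_fs_error() wrapper macro
+ * from messages.h, which supplies __func__ and __LINE__ automatically,
+ * e.g. (illustrative call only):
+ *
+ *   btrfs_handle_fs_error(fs_info, ret, "failed to commit transaction");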
+ */ +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; + const char *errstr; +#endif + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); +#endif + + /* + * Special case: if the error is EROFS, and we're already under + * SB_RDONLY, then it is safe here. + */ + if (errno == -EROFS && sb_rdonly(sb)) + return; + +#ifdef CONFIG_PRINTK + errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); + va_end(args); + } else { + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); + } +#endif + + /* + * Today we only save the error info to memory. Long term we'll also + * send it down to the disk. + */ + WRITE_ONCE(fs_info->fs_error, errno); + + /* Don't go through full error handling during mount. */ + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + btrfs_discard_stop(fs_info); + + /* Handle error by forcing the filesystem readonly. */ + btrfs_set_sb_rdonly(sb); + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writable + * again, the device replace operation continues. + */ +} + +#ifdef CONFIG_PRINTK +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
+{ + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; + struct va_format vaf; + va_list args; + int kern_level; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + + va_start(args, fmt); + + while ((kern_level = printk_get_level(fmt)) != 0) { + size_t size = printk_skip_level(fmt) - fmt; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } + fmt += size; + } + + vaf.fmt = fmt; + vaf.va = &args; + + if (__ratelimit(ratelimit)) { + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } + } + + va_end(args); +} +#endif + +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. + */ +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h new file mode 100644 index 0000000000..1ae6f8e23e --- /dev/null +++ b/fs/btrfs/messages.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MESSAGES_H +#define BTRFS_MESSAGES_H + +#include +#include +#include + +struct btrfs_fs_info; + +/* + * We want to be able to override this in btrfs-progs. + */ +#ifdef __KERNEL__ + +static inline __printf(2, 3) __cold +void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} + +#endif + +#ifdef CONFIG_PRINTK + +#define btrfs_printk(fs_info, fmt, args...) 
\ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#else + +#define btrfs_printk(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) +#endif + +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) +#define btrfs_debug(fs_info, fmt, args...) 
\ + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) +#elif defined(DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#endif + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#ifdef CONFIG_BTRFS_ASSERT + +#define btrfs_assertfail(expr, file, line) ({ \ + pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ + BUG(); \ +}) + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) (void)(expr) +#endif + +__printf(5, 6) +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +const char * __attribute_const__ btrfs_decode_error(int errno); + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. 
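+ * (BTRFS_32BIT_MAX_FILE_SIZE above is (ULONG_MAX + 1) << PAGE_SHIFT, i.e.
+ * 2^44 = 16T with 4K pages and 2^48 = 256T with 64K pages; 5/8 of those
+ * limits gives the figures below.)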
+ * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + +#endif diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h new file mode 100644 index 0000000000..40f2d9f1a1 --- /dev/null +++ b/fs/btrfs/misc.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MISC_H +#define BTRFS_MISC_H + +#include +#include +#include +#include + +/* + * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. + */ +#define ENUM_BIT(name) \ + __ ## name ## _BIT, \ + name = (1U << __ ## name ## _BIT), \ + __ ## name ## _SEQ = __ ## name ## _BIT + +static inline void cond_wake_up(struct wait_queue_head *wq) +{ + /* + * This implies a full smp_mb barrier, see comments for + * waitqueue_active why. + */ + if (wq_has_sleeper(wq)) + wake_up(wq); +} + +static inline void cond_wake_up_nomb(struct wait_queue_head *wq) +{ + /* + * Special case for conditional wakeup where the barrier required for + * waitqueue_active is implied by some of the preceding code. Eg. one + * of such atomic operations (atomic_dec_and_return, ...), or a + * unlock/lock sequence, etc. + */ + if (waitqueue_active(wq)) + wake_up(wq); +} + +static inline u64 mult_perc(u64 num, u32 percent) +{ + return div_u64(num * percent, 100); +} +/* Copy of is_power_of_two that is 64bit safe */ +static inline bool is_power_of_two_u64(u64 n) +{ + return n != 0 && (n & (n - 1)) == 0; +} + +static inline bool has_single_bit_set(u64 n) +{ + return is_power_of_two_u64(n); +} + +/* + * Simple bytenr based rb_tree relate structures + * + * Any structure wants to use bytenr as single search index should have their + * structure start with these members. + */ +struct rb_simple_node { + struct rb_node rb_node; + u64 bytenr; +}; + +static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *node = root->rb_node; + struct rb_simple_node *entry; + + while (node) { + entry = rb_entry(node, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) + node = node->rb_left; + else if (bytenr > entry->bytenr) + node = node->rb_right; + else + return node; + } + return NULL; +} + +/* + * Search @root from an entry that starts or comes after @bytenr. + * + * @root: the root to search. + * @bytenr: bytenr to search from. + * + * Return the rb_node that start at or after @bytenr. If there is no entry at + * or after @bytner return NULL. 
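+ *
+ * For example, with entries at bytenrs 4K, 8K and 16K: a search for 5K
+ * returns the 8K node, a search for 8K returns the 8K node, and a search
+ * for 20K returns NULL.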
+ */ +static inline struct rb_node *rb_simple_search_first(struct rb_root *root, + u64 bytenr) +{ + struct rb_node *node = root->rb_node, *ret = NULL; + struct rb_simple_node *entry, *ret_entry = NULL; + + while (node) { + entry = rb_entry(node, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) { + if (!ret || entry->bytenr < ret_entry->bytenr) { + ret = node; + ret_entry = entry; + } + + node = node->rb_left; + } else if (bytenr > entry->bytenr) { + node = node->rb_right; + } else { + return node; + } + } + + return ret; +} + +static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rb_simple_node *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static inline bool bitmap_test_range_all_set(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_zero; + + found_zero = find_next_zero_bit(addr, start + nbits, start); + return (found_zero == start + nbits); +} + +static inline bool bitmap_test_range_all_zero(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_set; + + found_set = find_next_bit(addr, start + nbits, start); + return (found_set == start + nbits); +} + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 0000000000..2b8ff8b53a --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,1264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
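bitmap_test_range_all_set() and bitmap_test_range_all_zero() above turn a range check into a single find_next_zero_bit()/find_next_bit() probe. The sketch below is a slower but easy-to-verify formulation of the same predicate in standalone C (toy helpers, stdlib only):

/* Naive userspace counterpart of the bitmap range helpers above. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static bool test_bit_ul(const unsigned long *addr, unsigned long bit)
{
        return (addr[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1UL;
}

/* True iff every bit in [start, start + nbits) is set. */
static bool range_all_set(const unsigned long *addr, unsigned long start,
                          unsigned long nbits)
{
        for (unsigned long i = start; i < start + nbits; i++)
                if (!test_bit_ul(addr, i))
                        return false;
        return true;
}

int main(void)
{
        unsigned long map[2] = { 0 };

        for (int i = 4; i < 12; i++)
                map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);

        printf("bits 4..11 all set: %d\n", range_all_set(map, 4, 8));   /* 1 */
        printf("bits 4..12 all set: %d\n", range_all_set(map, 4, 9));   /* 0 */
        return 0;
}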
+ */ + +#include +#include +#include +#include +#include "messages.h" +#include "misc.h" +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" +#include "disk-io.h" +#include "compression.h" +#include "delalloc-space.h" +#include "qgroup.h" +#include "subpage.h" +#include "file.h" +#include "super.h" + +static struct kmem_cache *btrfs_ordered_extent_cache; + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->num_bytes < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->num_bytes; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->num_bytes <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev = NULL; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (in_range(file_offset, entry->file_offset, entry->num_bytes)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +static struct btrfs_ordered_extent *alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, + u64 ram_bytes, u64 disk_bytenr, 
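entry_end() above clamps a wrapping offset+length to (u64)-1, and range_overlaps() is the usual half-open interval test. A self-contained userspace sketch of both, with a toy struct and uint64_t in place of u64:

/* Userspace sketch of the overflow clamp and overlap test above. */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range {
        uint64_t start;
        uint64_t len;
};

/* End of the range, clamped to UINT64_MAX if start + len wraps around. */
static uint64_t range_end(const struct range *r)
{
        if (r->start + r->len < r->start)
                return UINT64_MAX;
        return r->start + r->len;
}

/* Do [a, a + alen) and [r->start, range_end(r)) overlap? */
static bool overlaps(const struct range *r, uint64_t a, uint64_t alen)
{
        return !(a + alen <= r->start || range_end(r) <= a);
}

int main(void)
{
        struct range r = { .start = 4096, .len = 8192 };
        struct range wrap = { .start = UINT64_MAX - 10, .len = 100 };

        printf("end(r) = %" PRIu64 "\n", range_end(&r));            /* 12288 */
        printf("overlap [0,4096): %d\n", overlaps(&r, 0, 4096));    /* 0: only touching */
        printf("overlap [8192,1): %d\n", overlaps(&r, 8192, 1));    /* 1 */
        printf("end(wrap) clamped: %" PRIu64 "\n", range_end(&wrap)); /* UINT64_MAX */
        return 0;
}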
u64 disk_num_bytes, + u64 offset, unsigned long flags, int compress_type) +{ + struct btrfs_ordered_extent *entry; + int ret; + u64 qgroup_rsv = 0; + + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); + if (ret < 0) + return ERR_PTR(ret); + } else { + /* + * The ordered extent has reserved qgroup space, release now + * and pass the reserved number for qgroup_record to free. + */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); + if (ret < 0) + return ERR_PTR(ret); + } + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) + return ERR_PTR(-ENOMEM); + + entry->file_offset = file_offset; + entry->num_bytes = num_bytes; + entry->ram_bytes = ram_bytes; + entry->disk_bytenr = disk_bytenr; + entry->disk_num_bytes = disk_num_bytes; + entry->offset = offset; + entry->bytes_left = num_bytes; + entry->inode = igrab(&inode->vfs_inode); + entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; + entry->qgroup_rsv = qgroup_rsv; + entry->flags = flags; + refcount_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); + + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + + return entry; +} + +static void insert_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + + trace_btrfs_ordered_extent_add(inode, entry); + + percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes, + fs_info->delalloc_batch); + + /* One ref for the tree. */ + refcount_inc(&entry->refs); + + spin_lock_irq(&tree->lock); + node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "inconsistency in ordered tree at offset %llu", + entry->file_offset); + spin_unlock_irq(&tree->lock); + + spin_lock(&root->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); +} + +/* + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. 
+ * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted, and + * the returned pointer is given a second reference. + * + * Return: the new ordered extent or error pointer. + */ +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) +{ + struct btrfs_ordered_extent *entry; + + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); + + entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, + disk_bytenr, disk_num_bytes, offset, flags, + compress_type); + if (!IS_ERR(entry)) + insert_ordered_extent(entry); + return entry; +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(entry->inode)->ordered_tree; + spin_lock_irq(&tree->lock); + list_add_tail(&sum->list, &entry->list); + spin_unlock_irq(&tree->lock); +} + +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + +static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, + u64 len, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + lockdep_assert_held(&inode->ordered_tree.lock); + + if (page) { + ASSERT(page->mapping); + ASSERT(page_offset(page) <= file_offset); + ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + + /* + * Ordered (Private2) bit indicates whether we still have + * pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. + */ + if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + return false; + btrfs_page_clear_ordered(fs_info, page, file_offset, len); + } + + /* Now we're fine to update the accounting. */ + if (WARN_ON_ONCE(len > ordered->bytes_left)) { + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", + inode->root->root_key.objectid, btrfs_ino(inode), + ordered->file_offset, ordered->num_bytes, + len, ordered->bytes_left); + ordered->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + + if (ordered->bytes_left) + return false; + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags); + cond_wake_up(&ordered->wait); + refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_mark_finished(inode, ordered); + return true; +} + +static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? 
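can_finish_ordered_extent() above drives the completion protocol off bytes_left: every finished sub-range decrements it (with a clamp-and-complain path for bad accounting), and only the call that takes it to zero queues the finish work. A toy userspace model of just that accounting:

/* Userspace sketch of the bytes_left accounting; toy struct, no locking. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_ordered {
        uint64_t num_bytes;
        uint64_t bytes_left;
        bool io_err;
};

static bool complete_range(struct toy_ordered *oe, uint64_t len, bool uptodate)
{
        if (len > oe->bytes_left) {
                /* Mirrors the btrfs_crit() report plus clamp to zero. */
                fprintf(stderr, "bad accounting: to_dec=%llu left=%llu\n",
                        (unsigned long long)len,
                        (unsigned long long)oe->bytes_left);
                oe->bytes_left = 0;
        } else {
                oe->bytes_left -= len;
        }

        if (!uptodate)
                oe->io_err = true;

        return oe->bytes_left == 0;     /* the "queue the finish work" point */
}

int main(void)
{
        struct toy_ordered oe = { .num_bytes = 16384, .bytes_left = 16384 };

        printf("%d\n", complete_range(&oe, 4096, true));    /* 0: 12288 left */
        printf("%d\n", complete_range(&oe, 8192, true));    /* 0: 4096 left */
        printf("%d\n", complete_range(&oe, 4096, true));    /* 1: finished */
        return 0;
}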
+ fs_info->endio_freespace_worker : fs_info->endio_write_workers; + + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); +} + +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + unsigned long flags; + bool ret; + + trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); + + spin_lock_irqsave(&inode->ordered_tree.lock, flags); + ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + + if (ret) + btrfs_queue_ordered_fn(ordered); + return ret; +} + +/* + * Mark all ordered extents io inside the specified range finished. + * + * @page: The involved page for the operation. + * For uncompressed buffered IO, the page status also needs to be + * updated to indicate whether the pending ordered io is finished. + * Can be NULL for direct IO and compressed write. + * For these cases, callers are ensured they won't execute the + * endio function twice. + * + * This function is called for endio, thus the range must have ordered + * extent(s) covering it. + */ +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; + u64 cur = file_offset; + + trace_btrfs_writepage_end_io_hook(inode, file_offset, + file_offset + num_bytes - 1, + uptodate); + + spin_lock_irqsave(&tree->lock, flags); + while (cur < file_offset + num_bytes) { + u64 entry_end; + u64 end; + u32 len; + + node = tree_search(tree, cur); + /* No ordered extents at all */ + if (!node) + break; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + entry_end = entry->file_offset + entry->num_bytes; + /* + * |<-- OE --->| | + * cur + * Go to next OE. + */ + if (cur >= entry_end) { + node = rb_next(node); + /* No more ordered extents, exit */ + if (!node) + break; + entry = rb_entry(node, struct btrfs_ordered_extent, + rb_node); + + /* Go to next ordered extent and continue */ + cur = entry->file_offset; + continue; + } + /* + * | |<--- OE --->| + * cur + * Go to the start of OE. + */ + if (cur < entry->file_offset) { + cur = entry->file_offset; + continue; + } + + /* + * Now we are definitely inside one ordered extent. + * + * |<--- OE --->| + * | + * cur + */ + end = min(entry->file_offset + entry->num_bytes, + file_offset + num_bytes) - 1; + ASSERT(end + 1 - cur < U32_MAX); + len = end + 1 - cur; + + if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { + spin_unlock_irqrestore(&tree->lock, flags); + btrfs_queue_ordered_fn(entry); + spin_lock_irqsave(&tree->lock, flags); + } + cur += len; + } + spin_unlock_irqrestore(&tree->lock, flags); +} + +/* + * Finish IO for one ordered extent across a given range. The range can only + * contain one ordered extent. + * + * @cached: The cached ordered extent. If not NULL, we can skip the tree + * search and use the ordered extent directly. + * Will be also used to store the finished ordered extent. + * @file_offset: File offset for the finished IO + * @io_size: Length of the finish IO range + * + * Return true if the ordered extent is finished in the range, and update + * @cached. + * Return false otherwise. 
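The loop in btrfs_mark_ordered_io_finished() above walks a cursor across the endio range, clamping each step to the overlap between the range and one ordered extent and skipping holes between extents. The standalone sketch below replays that cursor logic over a sorted array instead of the rbtree:

/* Userspace sketch of the cursor walk; sorted array stands in for the rbtree. */
#include <stdint.h>
#include <stdio.h>

struct toy_extent {
        uint64_t start;
        uint64_t len;
};

static uint64_t min_u64(uint64_t a, uint64_t b)
{
        return a < b ? a : b;
}

static void walk_range(const struct toy_extent *ext, int nr,
                       uint64_t offset, uint64_t num_bytes)
{
        uint64_t cur = offset;
        int i = 0;

        while (cur < offset + num_bytes && i < nr) {
                uint64_t entry_end = ext[i].start + ext[i].len;

                if (cur >= entry_end) {         /* cursor is past this extent */
                        i++;
                        continue;
                }
                if (cur < ext[i].start) {       /* hole before the extent */
                        cur = ext[i].start;
                        continue;
                }
                /* Inside the extent: clamp to both the extent and the range. */
                uint64_t end = min_u64(entry_end, offset + num_bytes);

                printf("complete [%llu, %llu) of extent %d\n",
                       (unsigned long long)cur, (unsigned long long)end, i);
                cur = end;
        }
}

int main(void)
{
        const struct toy_extent ext[] = { { 0, 4096 }, { 8192, 4096 } };

        /* Endio for [2048, 10240): finishes the tail of extent 0 and the
         * head of extent 1, skipping the hole between them. */
        walk_range(ext, 2, 2048, 8192);
        return 0;
}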
+ * + * NOTE: The range can NOT cross multiple ordered extents. + * Thus caller should ensure the range doesn't cross ordered extents. + */ +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; + bool finished = false; + + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) + goto out; + + if (io_size > entry->bytes_left) + btrfs_crit(inode->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, io_size); + + entry->bytes_left -= io_size; + + if (entry->bytes_left == 0) { + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* test_and_set_bit implies a barrier */ + cond_wake_up_nomb(&entry->wait); + } +out: + if (finished && cached && entry) { + *cached = entry; + refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_dec_test_pending(inode, entry); + } + spin_unlock_irqrestore(&tree->lock, flags); + return finished; +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); + + if (refcount_dec_and_test(&entry->refs)) { + ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); + ASSERT(RB_EMPTY_NODE(&entry->rb_node)); + if (entry->inode) + btrfs_add_delayed_iput(BTRFS_I(entry->inode)); + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kvfree(sum); + } + kmem_cache_free(btrfs_ordered_extent_cache, entry); + } +} + +/* + * remove an ordered extent from the tree. No references are dropped + * and waiters are woken up. + */ +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + bool pending; + bool freespace_inode; + + /* + * If this is a free space inode the thread has not acquired the ordered + * extents lockdep map. + */ + freespace_inode = btrfs_is_free_space_inode(btrfs_inode); + + btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); + /* This is paired with btrfs_alloc_ordered_extent. 
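btrfs_dec_test_ordered_pending() above uses test_and_set_bit() so that exactly one caller observes the extent becoming finished, however many race on the last decrement. The same idiom expressed with portable C11 atomics and toy names:

/* Userspace sketch of the "only one caller wins" completion idiom. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_IO_DONE (1u << 0)

static atomic_uint flags;

/* Returns true for exactly one caller, no matter how many race here. */
static bool mark_done_once(void)
{
        unsigned int old = atomic_fetch_or(&flags, TOY_IO_DONE);

        return !(old & TOY_IO_DONE);
}

int main(void)
{
        printf("first caller finished:  %d\n", mark_done_once());   /* 1 */
        printf("second caller finished: %d\n", mark_done_once());   /* 0 */
        return 0;
}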
*/ + spin_lock(&btrfs_inode->lock); + btrfs_mod_outstanding_extents(btrfs_inode, -1); + spin_unlock(&btrfs_inode->lock); + if (root != fs_info->tree_root) { + u64 release; + + if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) + release = entry->disk_num_bytes; + else + release = entry->num_bytes; + btrfs_delalloc_release_metadata(btrfs_inode, release, + test_bit(BTRFS_ORDERED_IOERR, + &entry->flags)); + } + + percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, + fs_info->delalloc_batch); + + tree = &btrfs_inode->ordered_tree; + spin_lock_irq(&tree->lock); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); + if (tree->last == node) + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); + spin_unlock_irq(&tree->lock); + + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans || BTRFS_FS_ERROR(fs_info)); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + + btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered); + + spin_lock(&root->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + root->nr_ordered_extents--; + + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); + + if (!root->nr_ordered_extents) { + spin_lock(&fs_info->ordered_root_lock); + BUG_ON(list_empty(&root->ordered_root)); + list_del_init(&root->ordered_root); + spin_unlock(&fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); + wake_up(&entry->wait); + if (!freespace_inode) + btrfs_lockdep_release(fs_info, btrfs_ordered_extent); +} + +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); + btrfs_start_ordered_extent(ordered); + complete(&ordered->completion); +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. 
+ */ +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, + const u64 range_start, const u64 range_len) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); + struct btrfs_ordered_extent *ordered, *next; + u64 count = 0; + const u64 range_end = range_start + range_len; + + mutex_lock(&root->ordered_extent_mutex); + spin_lock(&root->ordered_extent_lock); + list_splice_init(&root->ordered_extents, &splice); + while (!list_empty(&splice) && nr) { + ordered = list_first_entry(&splice, struct btrfs_ordered_extent, + root_extent_list); + + if (range_end <= ordered->disk_bytenr || + ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + + list_move_tail(&ordered->root_extent_list, + &root->ordered_extents); + refcount_inc(&ordered->refs); + spin_unlock(&root->ordered_extent_lock); + + btrfs_init_work(&ordered->flush_work, + btrfs_run_ordered_extent_work, NULL, NULL); + list_add_tail(&ordered->work_list, &works); + btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); + + cond_resched(); + spin_lock(&root->ordered_extent_lock); + if (nr != U64_MAX) + nr--; + count++; + } + list_splice_tail(&skipped, &root->ordered_extents); + list_splice_tail(&splice, &root->ordered_extents); + spin_unlock(&root->ordered_extent_lock); + + list_for_each_entry_safe(ordered, next, &works, work_list) { + list_del_init(&ordered->work_list); + wait_for_completion(&ordered->completion); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + mutex_unlock(&root->ordered_extent_mutex); + + return count; +} + +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len) +{ + struct btrfs_root *root; + LIST_HEAD(splice); + u64 done; + + mutex_lock(&fs_info->ordered_operations_mutex); + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice) && nr) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + root = btrfs_grab_root(root); + BUG_ON(!root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); + btrfs_put_root(root); + + spin_lock(&fs_info->ordered_root_lock); + if (nr != U64_MAX) { + nr -= done; + } + } + list_splice_tail(&splice, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); +} + +/* + * Start IO and wait for a given ordered extent to finish. + * + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. + */ +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + u64 start = entry->file_offset; + u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); + bool freespace_inode; + + trace_btrfs_ordered_extent_start(inode, entry); + + /* + * If this is a free space inode do not take the ordered extents lockdep + * map. + */ + freespace_inode = btrfs_is_free_space_inode(inode); + + /* + * pages in the range can be dirty, clean or writeback. 
We + * start IO on any dirty ones so the wait doesn't stall waiting + * for the flusher thread to find them + */ + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + int ret = 0; + int ret_wb = 0; + u64 end; + u64 orig_end; + struct btrfs_ordered_extent *ordered; + + if (start + len < start) { + orig_end = OFFSET_MAX; + } else { + orig_end = start + len - 1; + if (orig_end > OFFSET_MAX) + orig_end = OFFSET_MAX; + } + + /* start IO across the range first to instantiate any delalloc + * extents + */ + ret = btrfs_fdatawrite_range(inode, start, orig_end); + if (ret) + return ret; + + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + + end = orig_end; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->num_bytes <= start) { + btrfs_put_ordered_extent(ordered); + break; + } + btrfs_start_ordered_extent(ordered); + end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't + * exit without waiting first for all other ordered extents in + * the range to complete. + */ + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + ret = -EIO; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + return ret_wb ? ret_wb : ret; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; + + tree = &inode->ordered_tree; + spin_lock_irqsave(&tree->lock, flags); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) + entry = NULL; + if (entry) { + refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup(inode, entry); + } +out: + spin_unlock_irqrestore(&tree->lock, flags); + return entry; +} + +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. 
+ */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &inode->ordered_tree; + spin_lock_irq(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) { + refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_range(inode, entry); + } + spin_unlock_irq(&tree->lock); + return entry; +} + +/* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); + } + spin_unlock_irq(&tree->lock); +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &inode->ordered_tree; + spin_lock_irq(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first(inode, entry); +out: + spin_unlock_irq(&tree->lock); + return entry; +} + +/* + * Lookup the first ordered extent that overlaps the range + * [@file_offset, @file_offset + @len). + * + * The difference between this and btrfs_lookup_first_ordered_extent() is + * that this one won't return any ordered extent that does not overlap the range. + * And the difference against btrfs_lookup_ordered_extent() is, this function + * ensures the first ordered extent gets returned. + */ +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *node; + struct rb_node *cur; + struct rb_node *prev; + struct rb_node *next; + struct btrfs_ordered_extent *entry = NULL; + + spin_lock_irq(&tree->lock); + node = tree->tree.rb_node; + /* + * Here we don't want to use tree_search() which will use tree->last + * and screw up the search order. + * And __tree_search() can't return the adjacent ordered extents + * either, thus here we do our own search. 
+ */ + while (node) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) { + node = node->rb_left; + } else if (file_offset >= entry_end(entry)) { + node = node->rb_right; + } else { + /* + * Direct hit, got an ordered extent that starts at + * @file_offset + */ + goto out; + } + } + if (!entry) { + /* Empty tree */ + goto out; + } + + cur = &entry->rb_node; + /* We got an entry around @file_offset, check adjacent entries */ + if (entry->file_offset < file_offset) { + prev = cur; + next = rb_next(cur); + } else { + prev = rb_prev(cur); + next = cur; + } + if (prev) { + entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + if (next) { + entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + /* No ordered extent in the range */ + entry = NULL; +out: + if (entry) { + refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first_range(inode, entry); + } + + spin_unlock_irq(&tree->lock); + return entry; +} + +/* + * Lock the passed range and ensures all pending ordered extents in it are run + * to completion. + * + * @inode: Inode whose ordered tree is to be searched + * @start: Beginning of range to flush + * @end: Last byte of range to lock + * @cached_state: If passed, will return the extent state responsible for the + * locked range. It's the caller's responsibility to free the + * cached state. + * + * Always return with the given range locked, ensuring after it's called no + * order extent can be pending. + */ +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + struct extent_state *cache = NULL; + struct extent_state **cachedp = &cache; + + if (cached_state) + cachedp = cached_state; + + while (1) { + lock_extent(&inode->io_tree, start, end, cachedp); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) { + /* + * If no external cached_state has been passed then + * decrement the extra ref taken for cachedp since we + * aren't exposing it outside of this function + */ + if (!cached_state) + refcount_dec(&cache->refs); + break; + } + unlock_extent(&inode->io_tree, start, end, cachedp); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } +} + +/* + * Lock the passed range and ensure all pending ordered extents in it are run + * to completion in nowait mode. + * + * Return true if btrfs_lock_ordered_range does not return any extents, + * otherwise false. + */ +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + + if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) + return false; + + ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); + if (!ordered) + return true; + + btrfs_put_ordered_extent(ordered); + unlock_extent(&inode->io_tree, start, end, cached_state); + + return false; +} + +/* Split out a new ordered extent for this first @len bytes of @ordered. 
*/ +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 file_offset = ordered->file_offset; + u64 disk_bytenr = ordered->disk_bytenr; + unsigned long flags = ordered->flags; + struct btrfs_ordered_sum *sum, *tmpsum; + struct btrfs_ordered_extent *new; + struct rb_node *node; + u64 offset = 0; + + trace_btrfs_ordered_extent_split(inode, ordered); + + ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); + + /* + * The entire bio must be covered by the ordered extent, but we can't + * reduce the original extent to a zero length either. + */ + if (WARN_ON_ONCE(len >= ordered->num_bytes)) + return ERR_PTR(-EINVAL); + /* We cannot split partially completed ordered extents. */ + if (ordered->bytes_left) { + ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) + return ERR_PTR(-EINVAL); + } + /* We cannot split a compressed ordered extent. */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) + return ERR_PTR(-EINVAL); + + new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr, + len, 0, flags, ordered->compress_type); + if (IS_ERR(new)) + return new; + + /* One ref for the tree. */ + refcount_inc(&new->refs); + + spin_lock_irq(&root->ordered_extent_lock); + spin_lock(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; + rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); + if (tree->last == node) + tree->last = NULL; + + ordered->file_offset += len; + ordered->disk_bytenr += len; + ordered->num_bytes -= len; + ordered->disk_num_bytes -= len; + + if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { + ASSERT(ordered->bytes_left == 0); + new->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) { + if (ordered->truncated_len > len) { + ordered->truncated_len -= len; + } else { + new->truncated_len = ordered->truncated_len; + ordered->truncated_len = 0; + } + } + + list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) { + if (offset == len) + break; + list_move_tail(&sum->list, &new->list); + offset += sum->len; + } + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + ordered->file_offset); + + node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + new->file_offset); + spin_unlock(&tree->lock); + + list_add_tail(&new->root_extent_list, &root->ordered_extents); + root->nr_ordered_extents++; + spin_unlock_irq(&root->ordered_extent_lock); + return new; +} + +int __init ordered_data_init(void) +{ + btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", + sizeof(struct btrfs_ordered_extent), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_ordered_extent_cache) + return -ENOMEM; + + return 0; +} + +void __cold ordered_data_exit(void) +{ + kmem_cache_destroy(btrfs_ordered_extent_cache); +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 0000000000..173bd5c5df --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,214 @@ +/* 
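btrfs_split_ordered_extent() above carves the first @len bytes into a new record and advances the original past them. Stripped of the tree, csum and refcount handling, the arithmetic reduces to the userspace sketch below (toy struct; as in the kernel function, the caller is assumed to have checked len < num_bytes):

/* Userspace sketch of the front-split arithmetic only. */
#include <stdint.h>
#include <stdio.h>

struct toy_oe {
        uint64_t file_offset;
        uint64_t disk_bytenr;
        uint64_t num_bytes;
};

static struct toy_oe split_front(struct toy_oe *oe, uint64_t len)
{
        struct toy_oe front = {
                .file_offset = oe->file_offset,
                .disk_bytenr = oe->disk_bytenr,
                .num_bytes = len,
        };

        /* The original record now starts where the new one ends. */
        oe->file_offset += len;
        oe->disk_bytenr += len;
        oe->num_bytes -= len;
        return front;
}

int main(void)
{
        struct toy_oe oe = { .file_offset = 0, .disk_bytenr = 1048576,
                             .num_bytes = 131072 };
        struct toy_oe front = split_front(&oe, 65536);

        printf("front: off=%llu disk=%llu len=%llu\n",
               (unsigned long long)front.file_offset,
               (unsigned long long)front.disk_bytenr,
               (unsigned long long)front.num_bytes);
        printf("rest:  off=%llu disk=%llu len=%llu\n",
               (unsigned long long)oe.file_offset,
               (unsigned long long)oe.disk_bytenr,
               (unsigned long long)oe.num_bytes);
        return 0;
}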
SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#ifndef BTRFS_ORDERED_DATA_H +#define BTRFS_ORDERED_DATA_H + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + spinlock_t lock; + struct rb_root tree; + struct rb_node *last; +}; + +struct btrfs_ordered_sum { + /* + * Logical start address and length for of the blocks covered by + * the sums array. + */ + u64 logical; + u32 len; + + struct list_head list; + /* last field is a variable length array of csums */ + u8 sums[]; +}; + +/* + * Bits for btrfs_ordered_extent::flags. + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +enum { + /* + * Different types for ordered extents, one and only one of the 4 types + * need to be set when creating ordered extent. + * + * REGULAR: For regular non-compressed COW write + * NOCOW: For NOCOW write into existing non-hole extent + * PREALLOC: For NOCOW write into preallocated extent + * COMPRESSED: For compressed COW write + */ + BTRFS_ORDERED_REGULAR, + BTRFS_ORDERED_NOCOW, + BTRFS_ORDERED_PREALLOC, + BTRFS_ORDERED_COMPRESSED, + + /* + * Extra bit for direct io, can only be set for + * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. + */ + BTRFS_ORDERED_DIRECT, + + /* Extra status bits for ordered extents */ + + /* set when all the pages are written */ + BTRFS_ORDERED_IO_DONE, + /* set when removed from the tree */ + BTRFS_ORDERED_COMPLETE, + /* We had an io error when writing this out */ + BTRFS_ORDERED_IOERR, + /* Set when we have to truncate an extent */ + BTRFS_ORDERED_TRUNCATED, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, + /* BTRFS_IOC_ENCODED_WRITE */ + BTRFS_ORDERED_ENCODED, +}; + +/* BTRFS_ORDERED_* flags that specify the type of the extent. */ +#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ + (1UL << BTRFS_ORDERED_NOCOW) | \ + (1UL << BTRFS_ORDERED_PREALLOC) | \ + (1UL << BTRFS_ORDERED_COMPRESSED) | \ + (1UL << BTRFS_ORDERED_DIRECT) | \ + (1UL << BTRFS_ORDERED_ENCODED)) + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* + * These fields directly correspond to the same fields in + * btrfs_file_extent_item. + */ + u64 num_bytes; + u64 ram_bytes; + u64 disk_bytenr; + u64 disk_num_bytes; + u64 offset; + + /* number of bytes that still need writing */ + u64 bytes_left; + + /* + * the end of the ordered extent which is behind it but + * didn't update disk_i_size. Please see the comment of + * btrfs_ordered_update_i_size(); + */ + u64 outstanding_isize; + + /* + * If we get truncated we need to adjust the file extent we enter for + * this ordered extent so that we do not expose stale data. 
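BTRFS_ORDERED_TYPE_FLAGS above collects the creation-time bits into one mask so callers can be screened for stray status bits with a single flags & ~mask test. A toy userspace rendering of that pattern (invented names):

/* Userspace sketch of the type-flags mask check. */
#include <stdbool.h>
#include <stdio.h>

enum {                  /* bit numbers, like the BTRFS_ORDERED_* enum */
        TOY_REGULAR,
        TOY_NOCOW,
        TOY_PREALLOC,
        TOY_COMPRESSED,
        TOY_IO_DONE,    /* status bit, never valid at creation time */
};

#define TOY_TYPE_FLAGS ((1UL << TOY_REGULAR) | (1UL << TOY_NOCOW) | \
                        (1UL << TOY_PREALLOC) | (1UL << TOY_COMPRESSED))

static bool valid_creation_flags(unsigned long flags)
{
        return (flags & ~TOY_TYPE_FLAGS) == 0;
}

int main(void)
{
        printf("%d\n", valid_creation_flags(1UL << TOY_NOCOW));     /* 1 */
        printf("%d\n", valid_creation_flags(1UL << TOY_IO_DONE));   /* 0 */
        return 0;
}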
+ */ + u64 truncated_len; + + /* flags (described above) */ + unsigned long flags; + + /* compression algorithm */ + int compress_type; + + /* Qgroup reserved space */ + int qgroup_rsv; + + /* reference count */ + refcount_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used for fast fsyncs */ + struct list_head log_list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; + + struct btrfs_work work; + + struct completion completion; + struct btrfs_work flush_work; + struct list_head work_list; +}; + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + spin_lock_init(&t->lock); + t->tree = RB_ROOT; + t->last = NULL; +} + +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, + struct btrfs_ordered_extent *entry); +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate); +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate); +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); +void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range( + struct btrfs_inode *inode, + u64 file_offset, + u64 len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, + const u64 range_start, const u64 range_len); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len); +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len); +int __init ordered_data_init(void); +void __cold ordered_data_exit(void); + +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 0000000000..7a1b021b56 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 
+1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "orphan.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret) { /* JDM: Really? */ + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h new file mode 100644 index 0000000000..3faab5cbb5 --- /dev/null +++ b/fs/btrfs/orphan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ORPHAN_H +#define BTRFS_ORPHAN_H + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +#endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 0000000000..0c93439e92 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ */ + +#include "messages.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "accessors.h" +#include "tree-checker.h" + +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, + { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(const struct btrfs_key *key, char *buf) +{ + int i; + + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", key->offset); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == key->objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", key->objectid); + return buf; +} + +static void print_chunk(const struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + pr_info("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n", + btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk), + btrfs_chunk_type(eb, chunk), num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + pr_info("\t\t\tstripe %d devid %llu offset %llu\n", i, + btrfs_stripe_devid_nr(eb, chunk, i), + btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(const struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n", + btrfs_device_id(eb, dev_item), + btrfs_device_total_bytes(eb, dev_item), + btrfs_device_bytes_used(eb, dev_item)); +} +static void print_extent_data_ref(const struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n", + btrfs_extent_data_ref_root(eb, ref), + btrfs_extent_data_ref_objectid(eb, ref), + btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(const struct extent_buffer *eb, int slot, int type) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + u32 item_size = btrfs_item_size(eb, slot); + u64 flags; + u64 offset; + int ref_index = 0; + + if (unlikely(item_size < sizeof(*ei))) { + btrfs_err(eb->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + pr_info("\t\textent refs %llu gen %llu flags %llu\n", + btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), + flags); + + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + pr_info("\t\ttree block key (%llu %u %llu) level %d\n", 
+ btrfs_disk_key_objectid(&key), key.type, + btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + pr_info("\t\tref#%d: ", ref_index++); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + pr_cont("tree block backref root %llu\n", offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + pr_cont("shared block backref parent %llu\n", offset); + /* + * offset is supposed to be a tree block which + * must be aligned to nodesize. + */ + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + pr_cont("shared data backref parent %llu count %u\n", + offset, btrfs_shared_data_ref_count(eb, sref)); + /* + * Offset is supposed to be a tree block which must be + * aligned to sectorsize. + */ + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); + break; + default: + pr_cont("(extent %llu has INVALID ref type %d)\n", + eb->start, type); + return; + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, + u32 item_size) +{ + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("BTRFS: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + return; + } + while (item_size) { + __le64 subvol_id; + + read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); + pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id)); + item_size -= sizeof(u64); + offset += sizeof(u64); + } +} + +/* + * Helper to output refs and locking status of extent buffer. Useful to debug + * race condition related problems. 
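The inline-reference loop above is a classic variable-length record walk: take a pointer and an end, decode a header, dispatch on the type, then advance by that record's size. A toy userspace walk over an invented two-byte-header format shows the skeleton, including the truncation guard such walks need:

/* Userspace sketch of a ptr/end walk over variable-sized records. */
#include <stddef.h>
#include <stdio.h>

static void walk_records(const unsigned char *buf, size_t item_size)
{
        const unsigned char *ptr = buf;
        const unsigned char *end = buf + item_size;
        int idx = 0;

        while (ptr + 2 <= end) {
                unsigned char type = ptr[0];
                unsigned char len = ptr[1];

                if (ptr + 2 + len > end) {      /* truncated record */
                        printf("record %d overflows the item\n", idx);
                        return;
                }
                printf("record %d: type %u, %u payload bytes\n", idx, type, len);
                ptr += 2 + len;                 /* advance by this record's size */
                idx++;
        }
}

int main(void)
{
        /* Two records: type 7 with 3 payload bytes, type 9 with 1. */
        const unsigned char item[] = { 7, 3, 0xaa, 0xbb, 0xcc, 9, 1, 0xdd };

        walk_records(item, sizeof(item));
        return 0;
}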
+ */ +static void print_eb_refs_lock(const struct extent_buffer *eb) +{ +#ifdef CONFIG_BTRFS_DEBUG + btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", + atomic_read(&eb->refs), eb->lock_owner, current->pid); +#endif +} + +void btrfs_print_leaf(const struct extent_buffer *l) +{ + struct btrfs_fs_info *fs_info; + int i; + u32 type, nr; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; + struct btrfs_key key; + struct btrfs_key found_key; + + if (!l) + return; + + fs_info = l->fs_info; + nr = btrfs_header_nritems(l); + + btrfs_info(fs_info, + "leaf %llu gen %llu total ptrs %d free space %d owner %llu", + btrfs_header_bytenr(l), btrfs_header_generation(l), nr, + btrfs_leaf_free_space(l), btrfs_header_owner(l)); + print_eb_refs_lock(l); + for (i = 0 ; i < nr ; i++) { + btrfs_item_key_to_cpu(l, &key, i); + type = key.type; + pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", + i, key.objectid, type, key.offset, + btrfs_item_offset(l, i), btrfs_item_size(l, i)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + pr_info("\t\tinode generation %llu size %llu mode %o\n", + btrfs_inode_generation(l, ii), + btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + pr_info("\t\tdir oid %llu flags %u\n", + found_key.objectid, + btrfs_dir_flags(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + pr_info("\t\troot data bytenr %llu refs %u\n", + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + pr_info("\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + pr_info("\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + pr_info("\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); + break; + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + pr_info("\t\tinline extent data size %llu\n", + btrfs_file_extent_ram_bytes(l, fi)); + break; + } + pr_info("\t\textent data disk bytenr %llu nr %llu\n", + btrfs_file_extent_disk_bytenr(l, fi), + btrfs_file_extent_disk_num_bytes(l, fi)); + pr_info("\t\textent data offset %llu nr %llu ram %llu\n", + btrfs_file_extent_offset(l, fi), + btrfs_file_extent_num_bytes(l, fi), + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + pr_info( + "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", + btrfs_block_group_used(l, bi), + btrfs_block_group_chunk_objectid(l, bi), + btrfs_block_group_flags(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, 
i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + pr_info("\t\tdev extent chunk_tree %llu\n\t\tchunk objectid %llu chunk offset %llu length %llu\n", + btrfs_dev_extent_chunk_tree(l, dev_extent), + btrfs_dev_extent_chunk_objectid(l, dev_extent), + btrfs_dev_extent_chunk_offset(l, dev_extent), + btrfs_dev_extent_length(l, dev_extent)); + break; + case BTRFS_PERSISTENT_ITEM_KEY: + pr_info("\t\tpersistent item objectid %llu offset %llu\n", + key.objectid, key.offset); + switch (key.objectid) { + case BTRFS_DEV_STATS_OBJECTID: + pr_info("\t\tdevice stats\n"); + break; + default: + pr_info("\t\tunknown persistent item\n"); + } + break; + case BTRFS_TEMPORARY_ITEM_KEY: + pr_info("\t\ttemporary item objectid %llu offset %llu\n", + key.objectid, key.offset); + switch (key.objectid) { + case BTRFS_BALANCE_OBJECTID: + pr_info("\t\tbalance status\n"); + break; + default: + pr_info("\t\tunknown temporary item\n"); + } + break; + case BTRFS_DEV_REPLACE_KEY: + pr_info("\t\tdev replace\n"); + break; + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), + btrfs_item_size(l, i)); + break; + } + } +} + +void btrfs_print_tree(const struct extent_buffer *c, bool follow) +{ + struct btrfs_fs_info *fs_info; + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + fs_info = c->fs_info; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(c); + return; + } + btrfs_info(fs_info, + "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu", + btrfs_header_bytenr(c), level, btrfs_header_generation(c), + nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr, + btrfs_header_owner(c)); + print_eb_refs_lock(c); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", + i, key.objectid, key.type, key.offset, + btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); + } + if (!follow) + return; + for (i = 0; i < nr; i++) { + struct btrfs_tree_parent_check check = { + .level = level - 1, + .transid = btrfs_node_ptr_generation(c, i), + .owner_root = btrfs_header_owner(c), + .has_first_key = true + }; + struct extent_buffer *next; + + btrfs_node_key_to_cpu(c, &check.first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); + if (IS_ERR(next)) + continue; + if (!extent_buffer_uptodate(next)) { + free_extent_buffer(next); + continue; + } + + if (btrfs_is_leaf(next) && + level != 1) + BUG(); + if (btrfs_header_level(next) != + level - 1) + BUG(); + btrfs_print_tree(next, follow); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 0000000000..c42bc666d5 --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ */ + +#ifndef BTRFS_PRINT_TREE_H +#define BTRFS_PRINT_TREE_H + +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + +void btrfs_print_leaf(const struct extent_buffer *l); +void btrfs_print_tree(const struct extent_buffer *c, bool follow); +const char *btrfs_root_name(const struct btrfs_key *key, char *buf); + +#endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c new file mode 100644 index 0000000000..0755af0e53 --- /dev/null +++ b/fs/btrfs/props.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2014 Filipe David Borba Manana + */ + +#include +#include "messages.h" +#include "props.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "ctree.h" +#include "xattr.h" +#include "compression.h" +#include "space-info.h" +#include "fs.h" +#include "accessors.h" +#include "super.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const struct btrfs_inode *inode, const char *value, + size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + bool (*ignore)(const struct btrfs_inode *inode); + int inheritable; +}; + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len) +{ + const struct prop_handler *handler; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) + return 0; + + return handler->validate(inode, value, value_len); +} + +/* + * Check if a property should be ignored (not set) for an inode. + * + * @inode: The target inode. + * @name: The property's name. + * + * The caller must be sure the given property name is valid, for example by + * having previously called btrfs_validate_prop(). 
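+ * Otherwise the ASSERT() on the handler lookup below would trip, since an
+ * unknown property name has no registered handler.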
+ * + * Returns: true if the property should be ignored for the given inode + * false if the property must not be ignored for the given inode + */ +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) +{ + const struct prop_handler *handler; + + handler = find_prop_handler(name, NULL); + ASSERT(handler != NULL); + + return handler->ignore(inode); +} + +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const char *value, size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, + value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + btrfs_setxattr(trans, inode, handler->xattr_name, NULL, + 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers = find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + 
btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(BTRFS_I(inode)), + root->root_key.objectid, ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(BTRFS_I(inode)); + + return iterate_object_props(root, path, ino, inode_prop_iterator, inode); +} + +static int prop_compression_validate(const struct btrfs_inode *inode, + const char *value, size_t len) +{ + if (!btrfs_inode_can_compress(inode)) + return -EINVAL; + + if (!value) + return 0; + + if (btrfs_compress_is_valid_type(value, len)) + return 0; + + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, const char *value, + size_t len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int type; + + /* Reset to defaults */ + if (len == 0) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + return 0; + } + + /* Set NOCOMPRESS flag */ + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, 3)) { + type = BTRFS_COMPRESS_LZO; + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (!strncmp("zlib", value, 4)) { + type = BTRFS_COMPRESS_ZLIB; + } else if (!strncmp("zstd", value, 4)) { + type = BTRFS_COMPRESS_ZSTD; + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + } else { + return -EINVAL; + } + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->prop_compress = type; + + return 0; +} + +static bool prop_compression_ignore(const struct btrfs_inode *inode) +{ + /* + * Compression only has effect for regular files, and for directories + * we set it just to propagate it to new files created inside them. + * Everything else (symlinks, devices, sockets, fifos) is pointless as + * it will do nothing, so don't waste metadata space on a compression + * xattr for anything that is neither a file nor a directory. 
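+ * As a result, inheritance in btrfs_inode_inherit_props() silently skips the
+ * property for e.g. a fifo or device node created inside a directory that
+ * has compression set.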
+ */ + if (!S_ISREG(inode->vfs_inode.i_mode) && + !S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + return false; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->prop_compress) { + case BTRFS_COMPRESS_ZLIB: + case BTRFS_COMPRESS_LZO: + case BTRFS_COMPRESS_ZSTD: + return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + default: + break; + } + + return NULL; +} + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .ignore = prop_compression_ignore, + .inheritable = 1 + }, +}; + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *parent) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + int i; + bool need_reserve = false; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + const struct prop_handler *h = &prop_handlers[i]; + const char *value; + u64 num_bytes = 0; + + if (!h->inheritable) + continue; + + if (h->ignore(BTRFS_I(inode))) + continue; + + value = h->extract(parent); + if (!value) + continue; + + /* + * This is not strictly necessary as the property should be + * valid, but in case it isn't, don't propagate it further. + */ + ret = h->validate(BTRFS_I(inode), value, strlen(value)); + if (ret) + continue; + + /* + * Currently callers should be reserving 1 item for properties, + * since we only have 1 property that we currently support. If + * we add more in the future we need to try and reserve more + * space for them. But we should also revisit how we do space + * reservations if we do add more properties in the future. 
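+ * The need_reserve logic below implements this: the first inherited property
+ * uses the caller's reservation, and every further one reserves a single
+ * extra item via btrfs_calc_insert_metadata_size(fs_info, 1).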
+ */ + if (need_reserve) { + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + ret = btrfs_block_rsv_add(fs_info, trans->block_rsv, + num_bytes, + BTRFS_RESERVE_NO_FLUSH); + if (ret) + return ret; + } + + ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + strlen(value), 0); + if (!ret) { + ret = h->apply(inode, value, strlen(value)); + if (ret) + btrfs_setxattr(trans, inode, h->xattr_name, + NULL, 0, 0); + else + set_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(inode)->runtime_flags); + } + + if (need_reserve) { + btrfs_block_rsv_release(fs_info, trans->block_rsv, + num_bytes, NULL); + if (ret) + return ret; + } + need_reserve = true; + } + + return 0; +} + +int __init btrfs_props_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } + return 0; +} + diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h new file mode 100644 index 0000000000..6e283196e3 --- /dev/null +++ b/fs/btrfs/props.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014 Filipe David Borba Manana + */ + +#ifndef BTRFS_PROPS_H +#define BTRFS_PROPS_H + +#include "ctree.h" + +int __init btrfs_props_init(void); + +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const char *value, size_t value_len, + int flags); +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len); +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +#endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c new file mode 100644 index 0000000000..a006f5160e --- /dev/null +++ b/fs/btrfs/qgroup.c @@ -0,0 +1,4443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2011 STRATO. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "ulist.h" +#include "backref.h" +#include "extent_io.h" +#include "qgroup.h" +#include "block-group.h" +#include "sysfs.h" +#include "tree-mod-log.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "tree-checker.h" + +/* + * Helpers to access qgroup reservation + * + * Callers should ensure the lock context and type are valid + */ + +static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) +{ + u64 ret = 0; + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + ret += qgroup->rsv.values[i]; + + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) +{ + if (type == BTRFS_QGROUP_RSV_DATA) + return "data"; + if (type == BTRFS_QGROUP_RSV_META_PERTRANS) + return "meta_pertrans"; + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) + return "meta_prealloc"; + return NULL; +} +#endif + +static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + qgroup->rsv.values[type] += num_bytes; +} + +static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + if (qgroup->rsv.values[type] >= num_bytes) { + qgroup->rsv.values[type] -= num_bytes; + return; + } +#ifdef CONFIG_BTRFS_DEBUG + WARN_RATELIMIT(1, + "qgroup %llu %s reserved space underflow, have %llu to free %llu", + qgroup->qgroupid, qgroup_rsv_type_str(type), + qgroup->rsv.values[type], num_bytes); +#endif + qgroup->rsv.values[type] = 0; +} + +static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); +} + +static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); +} + +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + +/* + * glue structure to represent the relations between qgroups. 
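+ * Each instance describes one member -> parent edge: it is linked into the
+ * member's ->groups list through next_group and into the parent's ->members
+ * list through next_member.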
+ */ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + +static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) +{ + return (u64)(uintptr_t)qg; +} + +static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) +{ + return (struct btrfs_qgroup *)(uintptr_t)n->aux; +} + +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags); +static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); + +/* must be called with qgroup_ioctl_lock held */ +static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node *n = fs_info->qgroup_tree.rb_node; + struct btrfs_qgroup *qgroup; + + while (n) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + if (qgroup->qgroupid < qgroupid) + n = n->rb_left; + else if (qgroup->qgroupid > qgroupid) + n = n->rb_right; + else + return qgroup; + } + return NULL; +} + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node **p = &fs_info->qgroup_tree.rb_node; + struct rb_node *parent = NULL; + struct btrfs_qgroup *qgroup; + + while (*p) { + parent = *p; + qgroup = rb_entry(parent, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < qgroupid) + p = &(*p)->rb_left; + else if (qgroup->qgroupid > qgroupid) + p = &(*p)->rb_right; + else + return qgroup; + } + + qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); + if (!qgroup) + return ERR_PTR(-ENOMEM); + + qgroup->qgroupid = qgroupid; + INIT_LIST_HEAD(&qgroup->groups); + INIT_LIST_HEAD(&qgroup->members); + INIT_LIST_HEAD(&qgroup->dirty); + INIT_LIST_HEAD(&qgroup->iterator); + + rb_link_node(&qgroup->node, parent, p); + rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); + + return qgroup; +} + +static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_qgroup_list *list; + + list_del(&qgroup->dirty); + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } +} + +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + + if (!qgroup) + return -ENOENT; + + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + __del_qgroup_rb(fs_info, qgroup); + return 0; +} + +/* + * Add relation specified by two qgroups. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the qgroups is NULL + * <0 other errors + */ +static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) +{ + struct btrfs_qgroup_list *list; + + if (!member || !parent) + return -ENOENT; + + list = kzalloc(sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + list->group = parent; + list->member = member; + list_add_tail(&list->next_group, &member->groups); + list_add_tail(&list->next_member, &parent->members); + + return 0; +} + +/* + * Add relation specified by two qgroup ids. + * + * Must be called with qgroup_lock held. 
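+ * Both ids are resolved with find_qgroup_rb() and the actual linking is done
+ * by __add_relation_rb().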
+ * + * Return: 0 on success + * -ENOENT if one of the ids does not exist + * <0 other errors + */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + + return __add_relation_rb(member, parent); +} + +/* Must be called with qgroup_lock held */ +static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + return 0; + } + } + return -ENOENT; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +{ + fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | + BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); +} + +/* + * The full config is read in one go, only called from open_ctree() + * It doesn't use any locking, as at this point we're still single-threaded + */ +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path *path = NULL; + struct extent_buffer *l; + int slot; + int ret = 0; + u64 flags = 0; + u64 rescan_progress = 0; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; + /* default this to quota off, in case no status key is found */ + fs_info->qgroup_flags = 0; + + /* + * pass 1: read status, all qgroup infos and limits + */ + key.objectid = 0; + key.type = 0; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); + if (ret) + goto out; + + while (1) { + struct btrfs_qgroup *qgroup; + + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { + struct btrfs_qgroup_status_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_status_item); + + if (btrfs_qgroup_status_version(l, ptr) != + BTRFS_QGROUP_STATUS_VERSION) { + btrfs_err(fs_info, + "old qgroup version, quota disabled"); + goto out; + } + if (btrfs_qgroup_status_generation(l, ptr) != + fs_info->generation) { + qgroup_mark_inconsistent(fs_info); + btrfs_err(fs_info, + "qgroup generation mismatch, marked as inconsistent"); + } + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, + ptr); + rescan_progress = btrfs_qgroup_status_rescan(l, ptr); + goto next1; + } + + if (found_key.type != BTRFS_QGROUP_INFO_KEY && + found_key.type != BTRFS_QGROUP_LIMIT_KEY) + 
goto next1; + + qgroup = find_qgroup_rb(fs_info, found_key.offset); + if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { + btrfs_err(fs_info, "inconsistent qgroup config"); + qgroup_mark_inconsistent(fs_info); + } + if (!qgroup) { + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out; + } + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + + switch (found_key.type) { + case BTRFS_QGROUP_INFO_KEY: { + struct btrfs_qgroup_info_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_info_item); + qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); + qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); + qgroup->excl = btrfs_qgroup_info_excl(l, ptr); + qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); + /* generation currently unused */ + break; + } + case BTRFS_QGROUP_LIMIT_KEY: { + struct btrfs_qgroup_limit_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_limit_item); + qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); + qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); + qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); + qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); + qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); + break; + } + } +next1: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } + btrfs_release_path(path); + + /* + * pass 2: read all qgroup relations + */ + key.objectid = 0; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); + if (ret) + goto out; + while (1) { + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type != BTRFS_QGROUP_RELATION_KEY) + goto next2; + + if (found_key.objectid > found_key.offset) { + /* parent <- member, not needed to build config */ + /* FIXME should we omit the key completely? */ + goto next2; + } + + ret = add_relation_rb(fs_info, found_key.objectid, + found_key.offset); + if (ret == -ENOENT) { + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", + found_key.objectid, found_key.offset); + ret = 0; /* ignore the error */ + } + if (ret) + goto out; +next2: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } +out: + btrfs_free_path(path); + fs_info->qgroup_flags |= flags; + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + btrfs_sysfs_del_qgroups(fs_info); + } + + return ret < 0 ? ret : 0; +} + +/* + * Called in close_ctree() when quota is still enabled. This verifies we don't + * leak some reserved space. + * + * Return false if no reserved space is left. + * Return true if some reserved space is leaked. + */ +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + bool ret = false; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return ret; + /* + * Since we're unmounting, there is no race and no need to grab qgroup + * lock. And here we don't go post-order to provide a more user + * friendly sorted result. 
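+ * Walking with rb_first()/rb_next() yields the qgroups in ascending qgroupid
+ * order, i.e. sorted by level and then by subvolume id.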
+ */ + for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { + struct btrfs_qgroup *qgroup; + int i; + + qgroup = rb_entry(node, struct btrfs_qgroup, node); + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { + if (qgroup->rsv.values[i]) { + ret = true; + btrfs_warn(fs_info, + "qgroup %hu/%llu has unreleased space, type %d rsv %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + i, qgroup->rsv.values[i]); + } + } + } + return ret; +} + +/* + * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), + * first two are in single-threaded paths.And for the third one, we have set + * quota_root to be null with qgroup_lock held before, so it is safe to clean + * up the in-memory structures without qgroup_lock held. + */ +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + while ((n = rb_first(&fs_info->qgroup_tree))) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + rb_erase(n, &fs_info->qgroup_tree); + __del_qgroup_rb(fs_info, qgroup); + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kfree(qgroup); + } + /* + * We call btrfs_free_qgroup_config() when unmounting + * filesystem and disabling quota, so we set qgroup_ulist + * to be null here to avoid double free. + */ + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + btrfs_sysfs_del_qgroups(fs_info); +} + +static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, + u64 dst) +{ + int ret; + struct btrfs_root *quota_root = trans->fs_info->quota_root; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); + + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, + u64 dst) +{ + int ret; + struct btrfs_root *quota_root = trans->fs_info->quota_root; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); +out: + btrfs_free_path(path); + return ret; +} + +static int add_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_qgroup_info_item *qgroup_info; + struct btrfs_qgroup_limit_item *qgroup_limit; + struct extent_buffer *leaf; + struct btrfs_key key; + + if (btrfs_is_testing(quota_root->fs_info)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + + /* + * Avoid a transaction abort by catching -EEXIST here. In that + * case, we proceed by re-initializing the existing structure + * on disk. 
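+ * Re-initializing means the INFO item is rewritten with the current transid
+ * and zeroed rfer/excl counters, and the LIMIT item with cleared limits.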
+ */ + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_info)); + if (ret && ret != -EEXIST) + goto out; + + leaf = path->nodes[0]; + qgroup_info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); + + btrfs_mark_buffer_dirty(trans, leaf); + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_limit)); + if (ret && ret != -EEXIST) + goto out; + + leaf = path->nodes[0]; + qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); + + btrfs_mark_buffer_dirty(trans, leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) +{ + int ret; + struct btrfs_root *quota_root = trans->fs_info->quota_root; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_root *quota_root = trans->fs_info->quota_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_limit_item *qgroup_limit; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_LIMIT_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); + + btrfs_mark_buffer_dirty(trans, l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_info_item(struct btrfs_trans_handle *trans, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path 
*path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_info_item *qgroup_info; + int ret; + int slot; + + if (btrfs_is_testing(fs_info)) + return 0; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); + btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); + btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); + btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); + + btrfs_mark_buffer_dirty(trans, l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_status_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_status_item *ptr; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); + btrfs_set_qgroup_status_generation(l, ptr, trans->transid); + btrfs_set_qgroup_status_rescan(l, ptr, + fs_info->qgroup_rescan_progress.objectid); + + btrfs_mark_buffer_dirty(trans, l); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called with qgroup_lock held + */ +static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf = NULL; + int ret; + int nr = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.offset = 0; + key.type = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + leaf = path->nodes[0]; + nr = btrfs_header_nritems(leaf); + if (!nr) + break; + /* + * delete the leaf one by one + * since the whole tree is going + * to be deleted. 
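+ * Deleting every item of a leaf also frees the leaf itself (unless it is the
+ * root), so each iteration shrinks the tree until only an empty root is left.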
+ */ + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_enable(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_path *path = NULL; + struct btrfs_qgroup_status_item *ptr; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_qgroup *qgroup = NULL; + struct btrfs_trans_handle *trans = NULL; + struct ulist *ulist = NULL; + int ret = 0; + int slot; + + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to enable quotas, because we will unlock + * and relock qgroup_ioctl_lock before setting fs_info->quota_root + * and before setting BTRFS_FS_QUOTA_ENABLED. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "qgroups are currently unsupported in extent tree v2"); + return -EINVAL; + } + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (fs_info->quota_root) + goto out; + + ulist = ulist_alloc(GFP_KERNEL); + if (!ulist) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; + + /* + * Unlock qgroup_ioctl_lock before starting the transaction. This is to + * avoid lock acquisition inversion problems (reported by lockdep) between + * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we + * start a transaction. + * After we started the transaction lock qgroup_ioctl_lock again and + * check if someone else created the quota root in the meanwhile. If so, + * just return success and release the transaction handle. + * + * Also we don't need to worry about someone else calling + * btrfs_sysfs_add_qgroups() after we unlock and getting an error because + * that function returns 0 (success) when the sysfs entries already exist. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * 1 for quota root item + * 1 for BTRFS_QGROUP_STATUS item + * + * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items + * per subvolume. However those are not currently reserved since it + * would be a lot of overkill. 
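+ * (n being the number of subvolumes found by the BTRFS_ROOT_REF_KEY scan
+ * below, each of which gets one QGROUP_INFO and one QGROUP_LIMIT item)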
+ */ + trans = btrfs_start_transaction(tree_root, 2); + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + if (fs_info->quota_root) + goto out; + + fs_info->qgroup_ulist = ulist; + ulist = NULL; + + /* + * initially create the quota tree + */ + quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); + if (IS_ERR(quota_root)) { + ret = PTR_ERR(quota_root); + btrfs_abort_transaction(trans, ret); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); + goto out_free_root; + } + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*ptr)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); + btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); + + btrfs_mark_buffer_dirty(trans, leaf); + + key.objectid = 0; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = 0; + + btrfs_release_path(path); + ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); + if (ret > 0) + goto out_add_root; + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.type == BTRFS_ROOT_REF_KEY) { + + /* Release locks on tree_root before we access quota_root */ + btrfs_release_path(path); + + ret = add_qgroup_item(trans, quota_root, + found_key.offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = btrfs_search_slot_for_read(tree_root, &found_key, + path, 1, 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + if (ret > 0) { + /* + * Shouldn't happen, but in case it does we + * don't need to do the btrfs_next_item, just + * continue. 
+ */ + continue; + } + } + ret = btrfs_next_item(tree_root, path); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + if (ret) + break; + } + +out_add_root: + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + + qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* + * Commit the transaction while not holding qgroup_ioctl_lock, to avoid + * a deadlock with tasks concurrently doing other qgroup operations, such + * adding/removing qgroups or adding/deleting qgroup relations for example, + * because all qgroup operations first start or join a transaction and then + * lock the qgroup_ioctl_lock mutex. + * We are safe from a concurrent task trying to enable quotas, by calling + * this function, since we are serialized by fs_info->subvol_sem. + */ + ret = btrfs_commit_transaction(trans); + trans = NULL; + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (ret) + goto out_free_path; + + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + fs_info->qgroup_rescan_running = true; + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } else { + /* + * We have set both BTRFS_FS_QUOTA_ENABLED and + * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with + * -EINPROGRESS. That can happen because someone started the + * rescan worker by calling quota rescan ioctl before we + * attempted to initialize the rescan worker. Failure due to + * quotas disabled in the meanwhile is not possible, because + * we are holding a write lock on fs_info->subvol_sem, which + * is also acquired when disabling quotas. + * Ignore such error, and any other error would need to undo + * everything we did in the transaction we just committed. + */ + ASSERT(ret == -EINPROGRESS); + ret = 0; + } + +out_free_path: + btrfs_free_path(path); +out_free_root: + if (ret) + btrfs_put_root(quota_root); +out: + if (ret) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + btrfs_sysfs_del_qgroups(fs_info); + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (ret && trans) + btrfs_end_transaction(trans); + else if (trans) + ret = btrfs_end_transaction(trans); + ulist_free(ulist); + return ret; +} + +int btrfs_quota_disable(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root; + struct btrfs_trans_handle *trans = NULL; + int ret = 0; + + /* + * We need to have subvol_sem write locked to prevent races with + * snapshot creation. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + + /* + * Lock the cleaner mutex to prevent races with concurrent relocation, + * because relocation may be building backrefs for blocks of the quota + * root while we are deleting the root. This is like dropping fs roots + * of deleted snapshots/subvolumes, we need the same protection. 
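+ * (the cleaner thread drops deleted subvolumes under this same cleaner_mutex)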
+ * + * This also prevents races between concurrent tasks trying to disable + * quotas, because we will unlock and relock qgroup_ioctl_lock across + * BTRFS_FS_QUOTA_ENABLED changes. + */ + mutex_lock(&fs_info->cleaner_mutex); + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + goto out; + + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * Request qgroup rescan worker to complete and wait for it. This wait + * must be done before transaction start for quota disable since it may + * deadlock with transaction by the qgroup rescan worker. + */ + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + btrfs_qgroup_wait_for_completion(fs_info, false); + + /* + * 1 For the root item + * + * We should also reserve enough items for the quota tree deletion in + * btrfs_clean_quota_tree but this is not done. + * + * Also, we must always start a transaction without holding the mutex + * qgroup_ioctl_lock, see btrfs_quota_enable(). + */ + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + goto out; + } + + if (!fs_info->quota_root) + goto out; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + spin_unlock(&fs_info->qgroup_lock); + + btrfs_free_qgroup_config(fs_info); + + ret = btrfs_clean_quota_tree(trans, quota_root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_del_root(trans, "a_root->root_key); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + spin_lock(&fs_info->trans_lock); + list_del("a_root->dirty_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_tree_lock(quota_root->node); + btrfs_clear_buffer_dirty(trans, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); + + btrfs_put_root(quota_root); + +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (ret && trans) + btrfs_end_transaction(trans); + else if (trans) + ret = btrfs_end_transaction(trans); + mutex_unlock(&fs_info->cleaner_mutex); + + return ret; +} + +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); +} + +static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) +{ + if (!list_empty(&qgroup->iterator)) + return; + + list_add_tail(&qgroup->iterator, head); +} + +static void qgroup_iterator_clean(struct list_head *head) +{ + while (!list_empty(head)) { + struct btrfs_qgroup *qgroup; + + qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); + list_del_init(&qgroup->iterator); + } +} + +/* + * The easy accounting, we're updating qgroup relationship whose child qgroup + * only has exclusive extents. + * + * In this case, all exclusive extents will also be exclusive for parent, so + * excl/rfer just get added/removed. 
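+ * For example, removing a child whose rfer == excl == 1GiB simply subtracts
+ * 1GiB from both the rfer and excl of every parent up the hierarchy.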
+ * + * So is qgroup reservation space, which should also be added/removed to + * parent. + * Or when child tries to release reservation space, parent will underflow its + * reservation (for relationship adding case). + * + * Caller should hold fs_info->qgroup_lock. + */ +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 ref_root, + struct btrfs_qgroup *src, int sign) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + u64 num_bytes = src->excl; + int ret = 0; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + + if (sign > 0) + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); + else + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + qgroup_to_aux(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = unode_aux_to_qgroup(unode); + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + if (sign > 0) + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); + else + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); + qgroup->excl_cmpr += sign * num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + qgroup_to_aux(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + return ret; +} + + +/* + * Quick path for updating qgroup with only excl refs. + * + * In that case, just update all parent will be enough. + * Or we needs to do a full rescan. + * Caller should also hold fs_info->qgroup_lock. + * + * Return 0 for quick update, return >0 for need to full rescan + * and mark INCONSISTENT flag. + * Return < 0 for other error. + */ +static int quick_update_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 src, u64 dst, + int sign) +{ + struct btrfs_qgroup *qgroup; + int ret = 1; + int err = 0; + + qgroup = find_qgroup_rb(fs_info, src); + if (!qgroup) + goto out; + if (qgroup->excl == qgroup->rfer) { + ret = 0; + err = __qgroup_excl_accounting(fs_info, tmp, dst, + qgroup, sign); + if (err < 0) { + ret = err; + goto out; + } + } +out: + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; + struct ulist *tmp; + unsigned int nofs_flag; + int ret = 0; + + /* Check the level of src and dst first */ + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + return -EINVAL; + + /* We hold a transaction handle open, must do a NOFS allocation. 
*/ + nofs_flag = memalloc_nofs_save(); + tmp = ulist_alloc(GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + if (!tmp) + return -ENOMEM; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) { + ret = -ENOTCONN; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + ret = -EEXIST; + goto out; + } + } + + ret = add_qgroup_relation_item(trans, src, dst); + if (ret) + goto out; + + ret = add_qgroup_relation_item(trans, dst, src); + if (ret) { + del_qgroup_relation_item(trans, src, dst); + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + ret = __add_relation_rb(member, parent); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + ret = quick_update_accounting(fs_info, tmp, src, dst, 1); + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + ulist_free(tmp); + return ret; +} + +static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; + struct ulist *tmp; + bool found = false; + unsigned int nofs_flag; + int ret = 0; + int ret2; + + /* We hold a transaction handle open, must do a NOFS allocation. */ + nofs_flag = memalloc_nofs_save(); + tmp = ulist_alloc(GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + if (!tmp) + return -ENOMEM; + + if (!fs_info->quota_root) { + ret = -ENOTCONN; + goto out; + } + + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + /* + * The parent/member pair doesn't exist, then try to delete the dead + * relation items only. 
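+ * (both the src -> dst and dst -> src items; -ENOENT from one of them is
+ * fine as long as the other deletion succeeds)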
+ */ + if (!member || !parent) + goto delete_item; + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + found = true; + break; + } + } + +delete_item: + ret = del_qgroup_relation_item(trans, src, dst); + if (ret < 0 && ret != -ENOENT) + goto out; + ret2 = del_qgroup_relation_item(trans, dst, src); + if (ret2 < 0 && ret2 != -ENOENT) + goto out; + + /* At least one deletion succeeded, return 0 */ + if (!ret || !ret2) + ret = 0; + + if (found) { + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + spin_unlock(&fs_info->qgroup_lock); + } +out: + ulist_free(tmp); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + ret = __del_qgroup_relation(trans, src, dst); + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + return ret; +} + +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) { + ret = -ENOTCONN; + goto out; + } + quota_root = fs_info->quota_root; + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + ret = -EEXIST; + goto out; + } + + ret = add_qgroup_item(trans, quota_root, qgroupid); + if (ret) + goto out; + + spin_lock(&fs_info->qgroup_lock); + qgroup = add_qgroup_rb(fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); + + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out; + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) { + ret = -ENOTCONN; + goto out; + } + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + /* Check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + + ret = del_qgroup_item(trans, qgroupid); + if (ret && ret != -ENOENT) + goto out; + + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + ret = __del_qgroup_relation(trans, qgroupid, + list->group->qgroupid); + if (ret) + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + del_qgroup_rb(fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); + + /* + * Remove the qgroup from sysfs now without holding the qgroup_lock + * spinlock, since the sysfs_remove_group() function needs to take + * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). + */ + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kfree(qgroup); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, + struct btrfs_qgroup_limit *limit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_qgroup *qgroup; + int ret = 0; + /* Sometimes we would want to clear the limit on this qgroup. 
+ * To meet this requirement, we treat the -1 as a special value + * which tell kernel to clear the limit on this qgroup. + */ + const u64 CLEAR_VALUE = -1; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) { + ret = -ENOTCONN; + goto out; + } + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { + if (limit->max_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + qgroup->max_rfer = 0; + } else { + qgroup->max_rfer = limit->max_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + if (limit->max_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + qgroup->max_excl = 0; + } else { + qgroup->max_excl = limit->max_excl; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { + if (limit->rsv_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + qgroup->rsv_rfer = 0; + } else { + qgroup->rsv_rfer = limit->rsv_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { + if (limit->rsv_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + qgroup->rsv_excl = 0; + } else { + qgroup->rsv_excl = limit->rsv_excl; + } + } + qgroup->lim_flags |= limit->flags; + + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_limit_item(trans, qgroup); + if (ret) { + qgroup_mark_inconsistent(fs_info); + btrfs_info(fs_info, "unable to update quota limit for %llu", + qgroupid); + } + +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) +{ + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; + + lockdep_assert_held(&delayed_refs->lock); + trace_btrfs_qgroup_trace_extent(fs_info, record); + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) { + p = &(*p)->rb_left; + } else if (bytenr > entry->bytenr) { + p = &(*p)->rb_right; + } else { + if (record->data_rsv && !entry->data_rsv) { + entry->data_rsv = record->data_rsv; + entry->data_rsv_refroot = + record->data_rsv_refroot; + } + return 1; + } + } + + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return 0; +} + +int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, + struct btrfs_qgroup_extent_record *qrecord) +{ + struct btrfs_backref_walk_ctx ctx = { 0 }; + int ret; + + /* + * We are always called in a context where we are already holding a + * transaction handle. Often we are called when adding a data delayed + * reference from btrfs_truncate_inode_items() (truncating or unlinking), + * in which case we will be holding a write lock on extent buffer from a + * subvolume tree. In this case we can't allow btrfs_find_all_roots() to + * acquire fs_info->commit_root_sem, because that is a higher level lock + * that must be acquired before locking any extent buffers. 
+ * + * So we want btrfs_find_all_roots() to not acquire the commit_root_sem + * but we can't pass it a non-NULL transaction handle, because otherwise + * it would not use commit roots and would lock extent buffers, causing + * a deadlock if it ends up trying to read lock the same extent buffer + * that was previously write locked at btrfs_truncate_inode_items(). + * + * So pass a NULL transaction handle to btrfs_find_all_roots() and + * explicitly tell it to not acquire the commit_root_sem - if we are + * holding a transaction handle we don't need its protection. + */ + ASSERT(trans != NULL); + + if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + return 0; + + ctx.bytenr = qrecord->bytenr; + ctx.fs_info = trans->fs_info; + + ret = btrfs_find_all_roots(&ctx, true); + if (ret < 0) { + qgroup_mark_inconsistent(trans->fs_info); + btrfs_warn(trans->fs_info, +"error accounting new delayed refs extent (err code: %d), quota inconsistent", + ret); + return 0; + } + + /* + * Here we don't need to get the lock of + * trans->transaction->delayed_refs, since inserted qrecord won't + * be deleted, only qrecord->node may be modified (new qrecord insert) + * + * So modifying qrecord->old_roots is safe here + */ + qrecord->old_roots = ctx.roots; + return 0; +} + +int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) + || bytenr == 0 || num_bytes == 0) + return 0; + record = kzalloc(sizeof(*record), GFP_NOFS); + if (!record) + return -ENOMEM; + + delayed_refs = &trans->transaction->delayed_refs; + record->bytenr = bytenr; + record->num_bytes = num_bytes; + record->old_roots = NULL; + + spin_lock(&delayed_refs->lock); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); + spin_unlock(&delayed_refs->lock); + if (ret > 0) { + kfree(record); + return 0; + } + return btrfs_qgroup_trace_extent_post(trans, record); +} + +int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + /* We can be called directly from walk_up_proc() */ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); + if (ret) + return ret; + } + cond_resched(); + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. 
If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. + */ +static int adjust_slots_upwards(struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * Helper function to trace a subtree tree block swap. + * + * The swap will happen in highest tree block, but there may be a lot of + * tree blocks involved. + * + * For example: + * OO = Old tree blocks + * NN = New tree blocks allocated during balance + * + * File tree (257) Reloc tree for 257 + * L2 OO NN + * / \ / \ + * L1 OO OO (a) OO NN (a) + * / \ / \ / \ / \ + * L0 OO OO OO OO OO OO NN NN + * (b) (c) (b) (c) + * + * When calling qgroup_trace_extent_swap(), we will pass: + * @src_eb = OO(a) + * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] + * @dst_level = 0 + * @root_level = 1 + * + * In that case, qgroup_trace_extent_swap() will search from OO(a) to + * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. + * + * The main work of qgroup_trace_extent_swap() can be split into 3 parts: + * + * 1) Tree search from @src_eb + * It should acts as a simplified btrfs_search_slot(). + * The key for search can be extracted from @dst_path->nodes[dst_level] + * (first key). + * + * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty + * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. + * They should be marked during previous (@dst_level = 1) iteration. + * + * 3) Mark file extents in leaves dirty + * We don't have good way to pick out new file extents only. + * So we still follow the old method by scanning all file extents in + * the leave. + * + * This function can free us from keeping two paths, thus later we only need + * to care about how to iterate all new tree blocks in reloc tree. 
+ */ +static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int dst_level, int root_level, + bool trace_leaf) +{ + struct btrfs_key key; + struct btrfs_path *src_path; + struct btrfs_fs_info *fs_info = trans->fs_info; + u32 nodesize = fs_info->nodesize; + int cur_level = root_level; + int ret; + + BUG_ON(dst_level > root_level); + /* Level mismatch */ + if (btrfs_header_level(src_eb) != root_level) + return -EINVAL; + + src_path = btrfs_alloc_path(); + if (!src_path) { + ret = -ENOMEM; + goto out; + } + + if (dst_level) + btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + else + btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + + /* For src_path */ + atomic_inc(&src_eb->refs); + src_path->nodes[root_level] = src_eb; + src_path->slots[root_level] = dst_path->slots[root_level]; + src_path->locks[root_level] = 0; + + /* A simplified version of btrfs_search_slot() */ + while (cur_level >= dst_level) { + struct btrfs_key src_key; + struct btrfs_key dst_key; + + if (src_path->nodes[cur_level] == NULL) { + struct extent_buffer *eb; + int parent_slot; + + eb = src_path->nodes[cur_level + 1]; + parent_slot = src_path->slots[cur_level + 1]; + + eb = btrfs_read_node_slot(eb, parent_slot); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } + + src_path->nodes[cur_level] = eb; + + btrfs_tree_read_lock(eb); + src_path->locks[cur_level] = BTRFS_READ_LOCK; + } + + src_path->slots[cur_level] = dst_path->slots[cur_level]; + if (cur_level) { + btrfs_node_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_node_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } else { + btrfs_item_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_item_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } + /* Content mismatch, something went wrong */ + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { + ret = -ENOENT; + goto out; + } + cur_level--; + } + + /* + * Now both @dst_path and @src_path have been populated, record the tree + * blocks for qgroup accounting. + */ + ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, + nodesize); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, + nodesize); + if (ret < 0) + goto out; + + /* Record leaf file extents */ + if (dst_level == 0 && trace_leaf) { + ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); + } +out: + btrfs_free_path(src_path); + return ret; +} + +/* + * Helper function to do recursive generation-aware depth-first search, to + * locate all new tree blocks in a subtree of reloc tree. + * + * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) + * reloc tree + * L2 NN (a) + * / \ + * L1 OO NN (b) + * / \ / \ + * L0 OO OO OO NN + * (c) (d) + * If we pass: + * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], + * @cur_level = 1 + * @root_level = 1 + * + * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace + * above tree blocks along with their counter parts in file tree. + * While during search, old tree blocks OO(c) will be skipped as tree block swap + * won't affect OO(c). 
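The pruning rule described above (only descend into children whose generation is newer than last_snapshot) is what keeps subtree tracing cheap after a balance, and it is the core of qgroup_trace_new_subtree_blocks() defined just below. A minimal user-space sketch of that rule on a toy in-memory tree rather than extent buffers; the node layout, function name and generation numbers are all invented for illustration.

#include <stdio.h>
#include <stdint.h>

struct node {
	uint64_t gen;			/* generation the block was written in */
	int nr_children;
	struct node *children[4];
};

/*
 * Same pruning rule as qgroup_trace_new_subtree_blocks(): a child whose
 * generation predates last_snapshot cannot contain any swapped blocks, so
 * its whole subtree is skipped without ever being read.
 */
static void visit_new_blocks(struct node *n, uint64_t last_snapshot, int depth)
{
	printf("%*strace block, gen=%llu\n", depth * 2, "",
	       (unsigned long long)n->gen);
	for (int i = 0; i < n->nr_children; i++) {
		if (n->children[i]->gen < last_snapshot)
			continue;	/* old subtree, never touched by the swap */
		visit_new_blocks(n->children[i], last_snapshot, depth + 1);
	}
}

int main(void)
{
	struct node c_old = { .gen = 90 };		/* like OO(c) above */
	struct node d_new = { .gen = 105 };		/* like NN(d) above */
	struct node b_new = { .gen = 103, .nr_children = 2,
			      .children = { &c_old, &d_new } };
	struct node root  = { .gen = 110, .nr_children = 1,
			      .children = { &b_new } };

	visit_new_blocks(&root, 100, 0);	/* last_snapshot == 100 */
	return 0;
}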
+ */ +static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int cur_level, int root_level, + u64 last_snapshot, bool trace_leaf) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *eb; + bool need_cleanup = false; + int ret = 0; + int i; + + /* Level sanity check */ + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || + root_level < cur_level) { + btrfs_err_rl(fs_info, + "%s: bad levels, cur_level=%d root_level=%d", + __func__, cur_level, root_level); + return -EUCLEAN; + } + + /* Read the tree block if needed */ + if (dst_path->nodes[cur_level] == NULL) { + int parent_slot; + u64 child_gen; + + /* + * dst_path->nodes[root_level] must be initialized before + * calling this function. + */ + if (cur_level == root_level) { + btrfs_err_rl(fs_info, + "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", + __func__, root_level, root_level, cur_level); + return -EUCLEAN; + } + + /* + * We need to get child blockptr/gen from parent before we can + * read it. + */ + eb = dst_path->nodes[cur_level + 1]; + parent_slot = dst_path->slots[cur_level + 1]; + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + /* This node is old, no need to trace */ + if (child_gen < last_snapshot) + goto out; + + eb = btrfs_read_node_slot(eb, parent_slot); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } + + dst_path->nodes[cur_level] = eb; + dst_path->slots[cur_level] = 0; + + btrfs_tree_read_lock(eb); + dst_path->locks[cur_level] = BTRFS_READ_LOCK; + need_cleanup = true; + } + + /* Now record this tree block and its counter part for qgroups */ + ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, + root_level, trace_leaf); + if (ret < 0) + goto cleanup; + + eb = dst_path->nodes[cur_level]; + + if (cur_level > 0) { + /* Iterate all child tree blocks */ + for (i = 0; i < btrfs_header_nritems(eb); i++) { + /* Skip old tree blocks as they won't be swapped */ + if (btrfs_node_ptr_generation(eb, i) < last_snapshot) + continue; + dst_path->slots[cur_level] = i; + + /* Recursive call (at most 7 times) */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, + dst_path, cur_level - 1, root_level, + last_snapshot, trace_leaf); + if (ret < 0) + goto cleanup; + } + } + +cleanup: + if (need_cleanup) { + /* Clean up */ + btrfs_tree_unlock_rw(dst_path->nodes[cur_level], + dst_path->locks[cur_level]); + free_extent_buffer(dst_path->nodes[cur_level]); + dst_path->nodes[cur_level] = NULL; + dst_path->slots[cur_level] = 0; + dst_path->locks[cur_level] = 0; + } +out: + return ret; +} + +static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct extent_buffer *src_eb, + struct extent_buffer *dst_eb, + u64 last_snapshot, bool trace_leaf) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *dst_path = NULL; + int level; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + /* Wrong parameter order */ + if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, + btrfs_header_generation(src_eb), + btrfs_header_generation(dst_eb)); + return -EUCLEAN; + } + + if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + ret = -EIO; + goto out; + } + + level = btrfs_header_level(dst_eb); + dst_path = btrfs_alloc_path(); + if 
(!dst_path) { + ret = -ENOMEM; + goto out; + } + /* For dst_path */ + atomic_inc(&dst_eb->refs); + dst_path->nodes[level] = dst_eb; + dst_path->slots[level] = 0; + dst_path->locks[level] = 0; + + /* Do the generation aware breadth-first search */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, + level, last_snapshot, trace_leaf); + if (ret < 0) + goto out; + ret = 0; + +out: + btrfs_free_path(dst_path); + if (ret < 0) + qgroup_mark_inconsistent(fs_info); + return ret; +} + +int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, + struct extent_buffer *root_eb, + u64 root_gen, int root_level) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret = 0; + int level; + u8 drop_subptree_thres; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + spin_lock(&fs_info->qgroup_lock); + drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + /* + * This function only gets called for snapshot drop, if we hit a high + * node here, it means we are going to change ownership for quite a lot + * of extents, which will greatly slow down btrfs_commit_transaction(). + * + * So here if we find a high tree here, we just skip the accounting and + * mark qgroup inconsistent. + */ + if (root_level >= drop_subptree_thres) { + qgroup_mark_inconsistent(fs_info); + return 0; + } + + if (!extent_buffer_uptodate(root_eb)) { + struct btrfs_tree_parent_check check = { + .has_first_key = false, + .transid = root_gen, + .level = root_level + }; + + ret = btrfs_read_extent_buffer(root_eb, &check); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + atomic_inc(&root_eb->refs); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int parent_slot; + u64 child_bytenr; + + /* + * We need to get child blockptr from parent before we + * can read it. 
+ */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + + eb = btrfs_read_node_slot(eb, parent_slot); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + path->locks[level] = BTRFS_READ_LOCK; + + ret = btrfs_qgroup_trace_extent(trans, child_bytenr, + fs_info->nodesize); + if (ret) + goto out; + } + + if (level == 0) { + ret = btrfs_qgroup_trace_leaf_items(trans, + path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 +/* + * Walk all of the roots that points to the bytenr and adjust their refcnts. + */ +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + int ret = 0; + + if (!roots) + return 0; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = unode_aux_to_qgroup(tmp_unode); + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); + else + btrfs_qgroup_update_new_refcnt(qg, seq, 1); + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + qgroup_to_aux(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + qgroup_to_aux(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + return 0; +} + +/* + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 parts. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclusive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). 
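To make the table above concrete, here is a stand-alone toy that applies the same four transition rules to a single qgroup and walks one common sequence: a fresh extent referenced only by subvolume A, then a snapshot adding a second root. Only the rules mirror qgroup_update_counters() defined just below; the struct and function names are invented for the illustration.

#include <stdio.h>
#include <stdint.h>

struct qg { uint64_t rfer, excl; };

/*
 * cur_old/cur_new: how many of the old/new roots reach this qgroup.
 * nr_old/nr_new:   total number of old/new roots referencing the extent.
 */
static void update_counters(struct qg *qg, uint64_t cur_old, uint64_t cur_new,
			    uint64_t nr_old, uint64_t nr_new, uint64_t bytes)
{
	/* Referenced: counted as soon as at least one root reaches the qgroup. */
	if (cur_old == 0 && cur_new > 0)
		qg->rfer += bytes;
	if (cur_old > 0 && cur_new == 0)
		qg->rfer -= bytes;

	/* Exclusive: counted only while *all* referencing roots are below it. */
	if (cur_old == nr_old && cur_new < nr_new && cur_old != 0)
		qg->excl -= bytes;		/* exclusive -> shared */
	if (cur_old < nr_old && cur_new == nr_new && cur_new != 0)
		qg->excl += bytes;		/* shared -> exclusive */
	if (cur_old == nr_old && cur_new == nr_new) {
		if (cur_old == 0 && cur_new != 0)
			qg->excl += bytes;	/* none -> exclusive */
		else if (cur_old != 0 && cur_new == 0)
			qg->excl -= bytes;	/* exclusive -> none */
	}
}

int main(void)
{
	struct qg a = { 0, 0 };

	/* New 16KiB extent referenced only by A: 0/0 old, 1/1 new. */
	update_counters(&a, 0, 1, 0, 1, 16384);
	/* A snapshot adds a second root: A stays referenced, loses exclusivity. */
	update_counters(&a, 1, 1, 1, 2, 16384);
	printf("rfer=%llu excl=%llu\n",
	       (unsigned long long)a.rfer, (unsigned long long)a.excl);
	return 0;
}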
+ * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. + */ +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + u64 cur_new_count, cur_old_count; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; + + qg = unode_aux_to_qgroup(unode); + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + trace_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } + if (cur_old_count > 0 && cur_new_count == 0) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } + + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + } + + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + } + + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } + } + + if (dirty) + qgroup_dirty(fs_info, qg); + } + return 0; +} + +/* + * Check if the @roots potentially is a list of fs tree roots + * + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots in the list (considering an empty + * one as well) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* + * If it contains fs tree roots, then it must belong to fs/subvol + * trees. + * If it contains a non-fs tree, it won't be shared with fs/subvol trees. + */ + return is_fstree(unode->val); +} + +int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes, struct ulist *old_roots, + struct ulist *new_roots) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; + u64 seq; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; + int ret = 0; + + /* + * If quotas get disabled meanwhile, the resources need to be freed and + * we can't just exit here. 
+ */ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + goto out_free; + + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; + nr_new_roots = new_roots->nnodes; + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; + nr_old_roots = old_roots->nnodes; + } + + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) + goto out_free; + + BUG_ON(!fs_info->quota_root); + + trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, + num_bytes, nr_old_roots, nr_new_roots); + + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) { + ret = -ENOMEM; + goto out_free; + } + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + spin_lock(&fs_info->qgroup_lock); + seq = fs_info->qgroup_seq; + + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); + if (ret < 0) + goto out; + + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); + if (ret < 0) + goto out; + + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); + + /* + * Bump qgroup_seq to avoid seq overlap + */ + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; +out: + spin_unlock(&fs_info->qgroup_lock); +out_free: + ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); + return ret; +} + +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 num_dirty_extents = 0; + u64 qgroup_to_skip; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + + num_dirty_extents++; + trace_btrfs_qgroup_account_extents(fs_info, record); + + if (!ret && !(fs_info->qgroup_flags & + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + ctx.bytenr = record->bytenr; + ctx.fs_info = fs_info; + + /* + * Old roots should be searched when inserting qgroup + * extent record. + * + * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, + * we may have some record inserted during + * NO_ACCOUNTING (thus no old_roots populated), but + * later we start rescan, which clears NO_ACCOUNTING, + * leaving some inserted records without old_roots + * populated. + * + * Those cases are rare and should not cause too much + * time spent during commit_transaction(). 
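Stepping back to the qgroup_seq bump at the end of btrfs_qgroup_account_extent() above: it is what keeps the per-qgroup old/new refcnts of one accounting round from leaking into the next. As I read the btrfs_qgroup_update_old_refcnt()/btrfs_qgroup_get_old_refcnt() helpers (declared elsewhere, not in this hunk), the count is stored as an offset above the round's seq, so any value below the current seq reads as zero. A small sketch of that encoding, with invented names, under that assumption:

#include <stdio.h>
#include <stdint.h>

/*
 * Seq-tagged refcount: the stored value is "seq of the round that last
 * touched it" + "count within that round".  Reading with a newer seq yields
 * 0, so no explicit reset between rounds is needed -- which is why
 * qgroup_seq must advance by at least max(nr_old, nr_new) + 1 per round.
 */
static void refcnt_add(uint64_t *refcnt, uint64_t seq, uint64_t mod)
{
	if (*refcnt < seq)
		*refcnt = seq + mod;	/* first touch in this round */
	else
		*refcnt += mod;
}

static uint64_t refcnt_get(uint64_t refcnt, uint64_t seq)
{
	return (refcnt < seq) ? 0 : refcnt - seq;
}

int main(void)
{
	uint64_t rc = 0;

	refcnt_add(&rc, 1000, 1);	/* round with seq == 1000 sees 2 roots */
	refcnt_add(&rc, 1000, 1);
	printf("round 1000 sees %llu\n", (unsigned long long)refcnt_get(rc, 1000));
	printf("round 1003 sees %llu\n", (unsigned long long)refcnt_get(rc, 1003));
	return 0;
}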
+ */ + if (!record->old_roots) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(&ctx, false); + if (ret < 0) + goto cleanup; + record->old_roots = ctx.roots; + ctx.roots = NULL; + } + + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); + /* + * Use BTRFS_SEQ_LAST as time_seq to do special search, + * which doesn't lock tree or delayed_refs and search + * current root. It's safe inside commit_transaction(). + */ + ctx.trans = trans; + ctx.time_seq = BTRFS_SEQ_LAST; + ret = btrfs_find_all_roots(&ctx, false); + if (ret < 0) + goto cleanup; + new_roots = ctx.roots; + if (qgroup_to_skip) { + ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } + ret = btrfs_qgroup_account_extent(trans, record->bytenr, + record->num_bytes, + record->old_roots, + new_roots); + record->old_roots = NULL; + new_roots = NULL; + } +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); + + } + trace_qgroup_num_dirty_extents(fs_info, trans->transid, + num_dirty_extents); + return ret; +} + +/* + * Writes all changed qgroups to disk. + * Called by the transaction commit path and the qgroup assign ioctl. + */ +int btrfs_run_qgroups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret = 0; + + /* + * In case we are called from the qgroup assign ioctl, assert that we + * are holding the qgroup_ioctl_lock, otherwise we can race with a quota + * disable operation (ioctl) and access a freed quota root. + */ + if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) + lockdep_assert_held(&fs_info->qgroup_ioctl_lock); + + if (!fs_info->quota_root) + return ret; + + spin_lock(&fs_info->qgroup_lock); + while (!list_empty(&fs_info->dirty_qgroups)) { + struct btrfs_qgroup *qgroup; + qgroup = list_first_entry(&fs_info->dirty_qgroups, + struct btrfs_qgroup, dirty); + list_del_init(&qgroup->dirty); + spin_unlock(&fs_info->qgroup_lock); + ret = update_qgroup_info_item(trans, qgroup); + if (ret) + qgroup_mark_inconsistent(fs_info); + ret = update_qgroup_limit_item(trans, qgroup); + if (ret) + qgroup_mark_inconsistent(fs_info); + spin_lock(&fs_info->qgroup_lock); + } + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; + else + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_status_item(trans); + if (ret) + qgroup_mark_inconsistent(fs_info); + + return ret; +} + +/* + * Copy the accounting information between qgroups. This is necessary + * when a snapshot or a subvolume is created. Throwing an error will + * cause a transaction abort so we take extra care here to only error + * when a readonly fs is a reasonable outcome. + */ +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, + u64 objectid, struct btrfs_qgroup_inherit *inherit) +{ + int ret = 0; + int i; + u64 *i_qgroups; + bool committing = false; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root; + struct btrfs_qgroup *srcgroup; + struct btrfs_qgroup *dstgroup; + bool need_rescan = false; + u32 level_size = 0; + u64 nums; + + /* + * There are only two callers of this function. + * + * One in create_subvol() in the ioctl context, which needs to hold + * the qgroup_ioctl_lock. 
+ * + * The other one in create_pending_snapshot() where no other qgroup + * code can modify the fs as they all need to either start a new trans + * or hold a trans handler, thus we don't need to hold + * qgroup_ioctl_lock. + * This would avoid long and complex lock chain and make lockdep happy. + */ + spin_lock(&fs_info->trans_lock); + if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) + committing = true; + spin_unlock(&fs_info->trans_lock); + + if (!committing) + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + goto out; + + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + for (i = 0; i < nums; ++i) { + srcgroup = find_qgroup_rb(fs_info, *i_qgroups); + + /* + * Zero out invalid groups so we can ignore + * them later. + */ + if (!srcgroup || + ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) + *i_qgroups = 0ULL; + + ++i_qgroups; + } + } + + /* + * create a tracking group for the subvol itself + */ + ret = add_qgroup_item(trans, quota_root, objectid); + if (ret) + goto out; + + /* + * add qgroup to all inherited groups + */ + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + if (*i_qgroups == 0) + continue; + ret = add_qgroup_relation_item(trans, objectid, + *i_qgroups); + if (ret && ret != -EEXIST) + goto out; + ret = add_qgroup_relation_item(trans, *i_qgroups, + objectid); + if (ret && ret != -EEXIST) + goto out; + } + ret = 0; + } + + + spin_lock(&fs_info->qgroup_lock); + + dstgroup = add_qgroup_rb(fs_info, objectid); + if (IS_ERR(dstgroup)) { + ret = PTR_ERR(dstgroup); + goto unlock; + } + + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + dstgroup->lim_flags = inherit->lim.flags; + dstgroup->max_rfer = inherit->lim.max_rfer; + dstgroup->max_excl = inherit->lim.max_excl; + dstgroup->rsv_rfer = inherit->lim.rsv_rfer; + dstgroup->rsv_excl = inherit->lim.rsv_excl; + + qgroup_dirty(fs_info, dstgroup); + } + + if (srcid) { + srcgroup = find_qgroup_rb(fs_info, srcid); + if (!srcgroup) + goto unlock; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + level_size = fs_info->nodesize; + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; + srcgroup->excl = level_size; + srcgroup->excl_cmpr = level_size; + + /* inherit the limit info */ + dstgroup->lim_flags = srcgroup->lim_flags; + dstgroup->max_rfer = srcgroup->max_rfer; + dstgroup->max_excl = srcgroup->max_excl; + dstgroup->rsv_rfer = srcgroup->rsv_rfer; + dstgroup->rsv_excl = srcgroup->rsv_excl; + + qgroup_dirty(fs_info, dstgroup); + qgroup_dirty(fs_info, srcgroup); + } + + if (!inherit) + goto unlock; + + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + if (*i_qgroups) { + ret = add_relation_rb(fs_info, objectid, *i_qgroups); + if (ret) + goto unlock; + } + ++i_qgroups; + + /* + * If we're doing a snapshot, and adding the snapshot to a new + * qgroup, the numbers are guaranteed to be incorrect. 
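The (qgroupid >> 48) comparison earlier in btrfs_qgroup_inherit() relies on the qgroupid layout: the top 16 bits carry the level and the low 48 bits the id, which is where the usual "level/id" notation comes from. A tiny illustration; the shift constant simply matches the >> 48 used above and the helper name is invented.

#include <stdio.h>
#include <stdint.h>

#define QGROUP_LEVEL_SHIFT 48		/* same split as the ">> 48" above */

static uint64_t make_qgroupid(uint16_t level, uint64_t id)
{
	return ((uint64_t)level << QGROUP_LEVEL_SHIFT) | id;
}

int main(void)
{
	uint64_t subvol = make_qgroupid(0, 257);	/* "0/257": tracks one subvolume */
	uint64_t parent = make_qgroupid(1, 100);	/* "1/100": a grouping qgroup */

	/*
	 * btrfs_qgroup_inherit() zeroes out any inherit source whose level is
	 * not strictly above the new subvolume's level, i.e. it keeps only
	 * entries with (srcgroup->qgroupid >> 48) > (objectid >> 48).
	 */
	printf("subvol level=%llu id=%llu, parent level=%llu id=%llu\n",
	       (unsigned long long)(subvol >> QGROUP_LEVEL_SHIFT),
	       (unsigned long long)(subvol & ((1ULL << QGROUP_LEVEL_SHIFT) - 1)),
	       (unsigned long long)(parent >> QGROUP_LEVEL_SHIFT),
	       (unsigned long long)(parent & ((1ULL << QGROUP_LEVEL_SHIFT) - 1)));
	return 0;
}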
+ */ + if (srcid) + need_rescan = true; + } + + for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->rfer = src->rfer - level_size; + dst->rfer_cmpr = src->rfer_cmpr - level_size; + + /* Manually tweaking numbers certainly needs a rescan */ + need_rescan = true; + } + for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->excl = src->excl + level_size; + dst->excl_cmpr = src->excl_cmpr + level_size; + need_rescan = true; + } + +unlock: + spin_unlock(&fs_info->qgroup_lock); + if (!ret) + ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); +out: + if (!committing) + mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (need_rescan) + qgroup_mark_inconsistent(fs_info); + return ret; +} + +static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) +{ + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && + qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) + return false; + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && + qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) + return false; + + return true; +} + +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, + enum btrfs_qgroup_rsv_type type) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 ref_root = root->root_key.objectid; + int ret = 0; + LIST_HEAD(qgroup_list); + + if (!is_fstree(ref_root)) + return 0; + + if (num_bytes == 0) + return 0; + + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { + ret = -EDQUOT; + goto out; + } + + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); + } + + ret = 0; + /* + * no limits exceeded, now record the reservation into all qgroups + */ + list_for_each_entry(qgroup, &qgroup_list, iterator) + qgroup_rsv_add(fs_info, qgroup, num_bytes, type); + +out: + qgroup_iterator_clean(&qgroup_list); + spin_unlock(&fs_info->qgroup_lock); + return ret; +} + +/* + * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 + * qgroup). + * + * Will handle all higher level qgroup too. + * + * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. + * This special case is only used for META_PERTRANS type. 
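qgroup_reserve() above is a two-phase walk: first visit the qgroup and every group it is (transitively) a member of and check limits, and only if nobody would go over the limit record the reservation in all of them. A toy sketch of that shape on a simple parent chain; real qgroups form a DAG with multiple parents, which the iterator list handles, and all names here are invented.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <errno.h>

struct group {
	const char *name;
	uint64_t used;		/* stands in for rfer + reserved */
	uint64_t limit;		/* 0 means unlimited */
	struct group *parent;
};

/* Check every ancestor first, then commit the reservation to all of them. */
static int reserve(struct group *g, uint64_t bytes, bool enforce)
{
	for (struct group *cur = g; cur; cur = cur->parent)
		if (enforce && cur->limit && cur->used + bytes > cur->limit)
			return -EDQUOT;

	for (struct group *cur = g; cur; cur = cur->parent)
		cur->used += bytes;
	return 0;
}

int main(void)
{
	struct group top = { "1/100", 0, 1 << 20, NULL };	/* 1 MiB cap */
	struct group sub = { "0/256", 0, 0, &top };

	printf("first reserve:  %d\n", reserve(&sub, 900 * 1024, true));
	printf("second reserve: %d\n", reserve(&sub, 200 * 1024, true)); /* parent over cap */
	return 0;
}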
+ */ +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + struct btrfs_qgroup *qgroup; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + if (!is_fstree(ref_root)) + return; + + if (num_bytes == 0) + return; + + if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { + WARN(1, "%s: Invalid type to free", __func__); + return; + } + spin_lock(&fs_info->qgroup_lock); + + if (!fs_info->quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + if (num_bytes == (u64)-1) + /* + * We're freeing all pertrans rsv, get reserved value from + * level 0 qgroup as real num_bytes to free. + */ + num_bytes = qgroup->rsv.values[type]; + + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + qgroup_to_aux(qgroup), GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = unode_aux_to_qgroup(unode); + + qgroup_rsv_release(fs_info, qg, num_bytes, type); + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + qgroup_to_aux(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + +out: + spin_unlock(&fs_info->qgroup_lock); +} + +/* + * Check if the leaf is the last leaf. Which means all node pointers + * are at their last position. + */ +static bool is_last_leaf(struct btrfs_path *path) +{ + int i; + + for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) + return false; + } + return true; +} + +/* + * returns < 0 on error, 0 when more leafs are to be scanned. + * returns 1 when done. + */ +static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; + struct btrfs_key found; + struct extent_buffer *scratch_leaf = NULL; + u64 num_bytes; + bool done; + int slot; + int ret; + + mutex_lock(&fs_info->qgroup_rescan_lock); + extent_root = btrfs_extent_root(fs_info, + fs_info->qgroup_rescan_progress.objectid); + ret = btrfs_search_slot_for_read(extent_root, + &fs_info->qgroup_rescan_progress, + path, 1, 0); + + btrfs_debug(fs_info, + "current progress key (%llu %u %llu), search_slot ret %d", + fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.type, + fs_info->qgroup_rescan_progress.offset, ret); + + if (ret) { + /* + * The rescan is about to end, we will not be scanning any + * further blocks. We cannot unset the RESCAN flag here, because + * we want to commit the transaction if everything went well. + * To make the live accounting work in this phase, we set our + * scan progress pointer such that every real extent objectid + * will be smaller. 
+ */ + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + return ret; + } + done = is_last_leaf(path); + + btrfs_item_key_to_cpu(path->nodes[0], &found, + btrfs_header_nritems(path->nodes[0]) - 1); + fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; + + scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); + if (!scratch_leaf) { + ret = -ENOMEM; + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto out; + } + slot = path->slots[0]; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) + continue; + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->nodesize; + else + num_bytes = found.offset; + + ctx.bytenr = found.objectid; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); + if (ret < 0) + goto out; + /* For rescan, just pass old_roots as NULL */ + ret = btrfs_qgroup_account_extent(trans, found.objectid, + num_bytes, NULL, ctx.roots); + if (ret < 0) + goto out; + } +out: + if (scratch_leaf) + free_extent_buffer(scratch_leaf); + + if (done && !ret) { + ret = 1; + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + } + return ret; +} + +static bool rescan_should_stop(struct btrfs_fs_info *fs_info) +{ + return btrfs_fs_closing(fs_info) || + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; +} + +static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, + qgroup_rescan_work); + struct btrfs_path *path; + struct btrfs_trans_handle *trans = NULL; + int err = -ENOMEM; + int ret = 0; + bool stopped = false; + bool did_leaf_rescans = false; + + path = btrfs_alloc_path(); + if (!path) + goto out; + /* + * Rescan should only search for commit root, and any later difference + * should be recorded by qgroup + */ + path->search_commit_root = 1; + path->skip_locking = 1; + + err = 0; + while (!err && !(stopped = rescan_should_stop(fs_info))) { + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + + err = qgroup_rescan_leaf(trans, path); + did_leaf_rescans = true; + + if (err > 0) + btrfs_commit_transaction(trans); + else + btrfs_end_transaction(trans); + } + +out: + btrfs_free_path(path); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (err > 0 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } else if (err < 0 || stopped) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + /* + * Only update status, since the previous part has already updated the + * qgroup info, and only if we did any actual work. This also prevents + * race with a concurrent quota disable, which has already set + * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at + * btrfs_quota_disable(). 
+ */ + if (did_leaf_rescans) { + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); + } + } else { + trans = NULL; + } + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!stopped || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } + } + fs_info->qgroup_rescan_running = false; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + + btrfs_end_transaction(trans); + + if (stopped) { + btrfs_info(fs_info, "qgroup scan paused"); + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { + btrfs_info(fs_info, "qgroup scan cancelled"); + } else if (err >= 0) { + btrfs_info(fs_info, "qgroup scan completed%s", + err > 0 ? " (inconsistency flag cleared)" : ""); + } else { + btrfs_err(fs_info, "qgroup scan failed with %d", err); + } +} + +/* + * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all + * memory required for the rescan context. + */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags) +{ + int ret = 0; + + if (!init_flags) { + /* we're resuming qgroup rescan at mount time */ + if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup rescan is not queued"); + ret = -EINVAL; + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); + ret = -EINVAL; + } + + if (ret) + return ret; + } + + mutex_lock(&fs_info->qgroup_rescan_lock); + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + btrfs_warn(fs_info, + "qgroup rescan is already in progress"); + ret = -EINPROGRESS; + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); + ret = -EINVAL; + } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* Quota disable is in progress */ + ret = -EBUSY; + } + + if (ret) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return ret; + } + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + + memset(&fs_info->qgroup_rescan_progress, 0, + sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + init_completion(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_worker, NULL, NULL); + return 0; +} + +static void +qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + spin_lock(&fs_info->qgroup_lock); + /* clear all current qgroup tracking information */ + for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + qgroup->rfer = 0; + qgroup->rfer_cmpr = 0; + qgroup->excl = 0; + qgroup->excl_cmpr = 0; + 
qgroup_dirty(fs_info, qgroup); + } + spin_unlock(&fs_info->qgroup_lock); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct btrfs_trans_handle *trans; + + ret = qgroup_rescan_init(fs_info, 0, 1); + if (ret) + return ret; + + /* + * We have set the rescan_progress to 0, which means no more + * delayed refs will be accounted by btrfs_qgroup_account_ref. + * However, btrfs_qgroup_account_ref may be right after its call + * to btrfs_find_all_roots, in which case it would still do the + * accounting. + * To solve this, we're committing the transaction, which will + * ensure we run all delayed refs and only after that, we are + * going to clear all tracking information for a clean start. + */ + + trans = btrfs_attach_transaction_barrier(fs_info->fs_root); + if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return PTR_ERR(trans); + } else if (trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_transaction(trans); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } + } + + qgroup_rescan_zero_tracking(fs_info); + + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + return 0; +} + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + running = fs_info->qgroup_rescan_running; + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!running) + return 0; + + if (interruptible) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); + + return ret; +} + +/* + * this is only called from open_ctree where we're still single threaded, thus + * locking is omitted here. + */ +void +btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) +{ + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + mutex_unlock(&fs_info->qgroup_rescan_lock); + } +} + +#define rbtree_iterate_from_safe(node, next, start) \ + for (node = start; node && ({ next = rb_next(node); 1;}); node = next) + +static int qgroup_unreserve_range(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len) +{ + struct rb_node *node; + struct rb_node *next; + struct ulist_node *entry; + int ret = 0; + + node = reserved->range_changed.root.rb_node; + if (!node) + return 0; + while (node) { + entry = rb_entry(node, struct ulist_node, rb_node); + if (entry->val < start) + node = node->rb_right; + else + node = node->rb_left; + } + + if (entry->val > start && rb_prev(&entry->rb_node)) + entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, + rb_node); + + rbtree_iterate_from_safe(node, next, &entry->rb_node) { + u64 entry_start; + u64 entry_end; + u64 entry_len; + int clear_ret; + + entry = rb_entry(node, struct ulist_node, rb_node); + entry_start = entry->val; + entry_end = entry->aux; + entry_len = entry_end - entry_start + 1; + + if (entry_start >= start + len) + break; + if (entry_start + entry_len <= start) + continue; + /* + * Now the entry is in [start, start + len), revert the + * EXTENT_QGROUP_RESERVED bit. 
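Stepping back to the rescan entry points above: they are normally driven through the quota-rescan ioctls. A minimal user-space sketch, assuming the uapi definitions in <linux/btrfs.h> (BTRFS_IOC_QUOTA_RESCAN, BTRFS_IOC_QUOTA_RESCAN_WAIT) and a filesystem mounted at the hypothetical /mnt; the blocking wait maps onto btrfs_qgroup_wait_for_completion(fs_info, true).

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	int fd = open("/mnt", O_RDONLY);
	if (fd < 0)
		return 1;

	struct btrfs_ioctl_quota_rescan_args args;
	memset(&args, 0, sizeof(args));		/* progress is reported by the kernel */

	/* -> btrfs_qgroup_rescan(); -EINPROGRESS if one is already running. */
	if (ioctl(fd, BTRFS_IOC_QUOTA_RESCAN, &args) < 0)
		perror("BTRFS_IOC_QUOTA_RESCAN");

	/* Blocks until the rescan worker completes; can return -EINTR because
	 * the kernel side waits with wait_for_completion_interruptible(). */
	if (ioctl(fd, BTRFS_IOC_QUOTA_RESCAN_WAIT) < 0)
		perror("BTRFS_IOC_QUOTA_RESCAN_WAIT");

	return 0;
}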
+ */ + clear_ret = clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); + if (!ret && clear_ret < 0) + ret = clear_ret; + + ulist_del(&reserved->range_changed, entry->val, entry->aux); + if (likely(reserved->bytes_changed >= entry_len)) { + reserved->bytes_changed -= entry_len; + } else { + WARN_ON(1); + reserved->bytes_changed = 0; + } + } + + return ret; +} + +/* + * Try to free some space for qgroup. + * + * For qgroup, there are only 3 ways to free qgroup space: + * - Flush nodatacow write + * Any nodatacow write will free its reserved data space at run_delalloc_range(). + * In theory, we should only flush nodatacow inodes, but it's not yet + * possible, so we need to flush the whole root. + * + * - Wait for ordered extents + * When ordered extents are finished, their reserved metadata is finally + * converted to per_trans status, which can be freed by later commit + * transaction. + * + * - Commit transaction + * This would free the meta_per_trans space. + * In theory this shouldn't provide much space, but any more qgroup space + * is needed. + */ +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* Can't hold an open transaction or we run the risk of deadlocking. */ + ASSERT(current->journal_info == NULL); + if (WARN_ON(current->journal_info)) + return 0; + + /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + } + + ret = btrfs_start_delalloc_snapshot(root, true); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; + goto out; + } + + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +} + +static int qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + struct btrfs_root *root = inode->root; + struct extent_changeset *reserved; + bool new_reserved = false; + u64 orig_reserved; + u64 to_reserve; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + !is_fstree(root->root_key.objectid) || len == 0) + return 0; + + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + new_reserved = true; + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; + ret = set_record_extent_bits(&inode->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; + trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, + to_reserve, QGROUP_RESERVE); + if (ret < 0) + goto out; + ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); + if (ret < 0) + goto cleanup; + + return ret; + +cleanup: + qgroup_unreserve_range(inode, reserved, start, len); +out: + if (new_reserved) { + extent_changeset_free(reserved); + *reserved_ret = NULL; + } + return ret; +} + +/* + * Reserve qgroup space for 
range [start, start + len). + * + * This function will either reserve space from related qgroups or do nothing + * if the range is already reserved. + * + * Return 0 for successful reservation + * Return <0 for error (including -EQUOT) + * + * NOTE: This function may sleep for memory allocation, dirty page flushing and + * commit transaction. So caller should not hold any dirty page locked. + */ +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret; + + ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +} + +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed_ret) +{ + struct btrfs_root *root = inode->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + u64 freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. 
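Back at btrfs_qgroup_reserve_data() above, the wrapper encodes a simple policy: try the cheap reservation first, and only on -EDQUOT pay for one flush (delalloc, ordered extents, commit) before a single retry. A stand-alone sketch of that shape; reserve_once/flush_space are invented stand-ins for qgroup_reserve_data() and try_flush_qgroup().

#include <stdio.h>
#include <errno.h>

/* Anything other than -EDQUOT is returned as-is; -EDQUOT buys exactly one
 * flush and one retry, mirroring the wrappers above. */
static int reserve_with_one_flush(int (*reserve_once)(void), int (*flush_space)(void))
{
	int ret = reserve_once();

	if (ret != -EDQUOT)
		return ret;

	ret = flush_space();		/* expensive: flush, wait, commit */
	if (ret < 0)
		return ret;

	return reserve_once();		/* one retry, may still be -EDQUOT */
}

/* Fake callbacks: the first attempt hits the quota, the retry succeeds. */
static int fake_reserve(void) { static int calls; return calls++ ? 0 : -EDQUOT; }
static int fake_flush(void)   { return 0; }

int main(void)
{
	printf("reservation result: %d\n",
	       reserve_with_one_flush(fake_reserve, fake_flush));
	return 0;
}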
+ */ + ret = clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, + BTRFS_QGROUP_RSV_DATA); + if (freed_ret) + *freed_ret = freed; + ret = 0; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + u64 *released, int free) +{ + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) + return 0; + + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len, released); + extent_changeset_init(&changeset); + ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + + if (free) + trace_op = QGROUP_FREE; + trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, + changeset.bytes_changed, trace_op); + if (free) + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); + if (released) + *released = changeset.bytes_changed; +out: + extent_changeset_release(&changeset); + return ret; +} + +/* + * Free a reserved space range from io_tree and related qgroups + * + * Should be called when a range of pages get invalidated before reaching disk. + * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. + * + * For data written to disk, use btrfs_qgroup_release_data(). + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_free_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed) +{ + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); +} + +/* + * Release a reserved space range from io_tree only. + * + * Should be called when a range of pages get written to disk and corresponding + * FILE_EXTENT is inserted into corresponding root. + * + * Since new qgroup accounting framework will only update qgroup numbers at + * commit_transaction() time, its reserved space shouldn't be freed from + * related qgroups. + * + * But we should release the range from io_tree, to allow further write to be + * COWed. + * + * NOTE: This function may sleep for memory allocation. 
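The comments above draw the release/free distinction: a release only clears the EXTENT_QGROUP_RESERVED range so the data can be written (COWed) again, while a free also hands the bytes back to the qgroup because the write never reached disk. A hedged two-counter userspace model of that difference (the toy names are invented for the sketch, not kernel symbols):

/* Build & run: cc -std=c99 -Wall rsv_model.c && ./a.out */
#include <stdio.h>
#include <stdint.h>

/* Toy model: one counter for bytes still marked reserved in the io_tree,
 * one for bytes charged against the qgroup itself. */
struct toy_inode_rsv {
	uint64_t io_tree_reserved;   /* cleared on both release and free */
	uint64_t qgroup_charged;     /* only given back on free */
};

/* "Release": data reached disk; keep the qgroup charge until commit but
 * allow the range to be dirtied again. */
static void toy_release(struct toy_inode_rsv *r, uint64_t bytes)
{
	r->io_tree_reserved -= bytes;
}

/* "Free": the write never made it (error / page invalidation), so the
 * qgroup charge is returned immediately as well. */
static void toy_free(struct toy_inode_rsv *r, uint64_t bytes)
{
	r->io_tree_reserved -= bytes;
	r->qgroup_charged -= bytes;
}

int main(void)
{
	struct toy_inode_rsv r = { .io_tree_reserved = 1 << 20,
				   .qgroup_charged   = 1 << 20 };

	toy_release(&r, 512 * 1024);	/* half the data was written out */
	toy_free(&r, 512 * 1024);	/* the other half hit an error */

	printf("io_tree reserved: %llu, qgroup charged: %llu\n",
	       (unsigned long long)r.io_tree_reserved,
	       (unsigned long long)r.qgroup_charged);
	return 0;
}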
+ */ +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) +{ + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); +} + +static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (type != BTRFS_QGROUP_RSV_META_PREALLOC && + type != BTRFS_QGROUP_RSV_META_PERTRANS) + return; + if (num_bytes == 0) + return; + + spin_lock(&root->qgroup_meta_rsv_lock); + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) + root->qgroup_meta_rsv_prealloc += num_bytes; + else + root->qgroup_meta_rsv_pertrans += num_bytes; + spin_unlock(&root->qgroup_meta_rsv_lock); +} + +static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (type != BTRFS_QGROUP_RSV_META_PREALLOC && + type != BTRFS_QGROUP_RSV_META_PERTRANS) + return 0; + if (num_bytes == 0) + return 0; + + spin_lock(&root->qgroup_meta_rsv_lock); + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { + num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, + num_bytes); + root->qgroup_meta_rsv_prealloc -= num_bytes; + } else { + num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, + num_bytes); + root->qgroup_meta_rsv_pertrans -= num_bytes; + } + spin_unlock(&root->qgroup_meta_rsv_lock); + return num_bytes; +} + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + !is_fstree(root->root_key.objectid) || num_bytes == 0) + return 0; + + BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); + trace_qgroup_meta_reserve(root, (s64)num_bytes, type); + ret = qgroup_reserve(root, num_bytes, enforce, type); + if (ret < 0) + return ret; + /* + * Record what we have reserved into root. + * + * To avoid quota disabled->enabled underflow. + * In that case, we may try to free space we haven't reserved + * (since quota was disabled), so record what we reserved into root. + * And ensure later release won't underflow this number. + */ + add_root_meta_rsv(root, num_bytes, type); + return ret; +} + +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush) +{ + int ret; + + ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); + if ((ret <= 0 && ret != -EDQUOT) || noflush) + return ret; + + ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); +} + +void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + !is_fstree(root->root_key.objectid)) + return; + + /* TODO: Update trace point to handle such free */ + trace_qgroup_meta_free_all_pertrans(root); + /* Special value -1 means to free all reserved space */ + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, + BTRFS_QGROUP_RSV_META_PERTRANS); +} + +void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + !is_fstree(root->root_key.objectid)) + return; + + /* + * reservation for META_PREALLOC can happen before quota is enabled, + * which can lead to underflow. 
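sub_root_meta_rsv() above clamps the amount being freed to what was actually recorded for the root, so a quota disable/enable cycle cannot drive the counter below zero. A small standalone model of that add/clamped-subtract pattern (toy names are illustrative, not kernel symbols):

/* Build & run: cc -std=c99 -Wall meta_rsv.c && ./a.out */
#include <stdio.h>
#include <stdint.h>

/* Toy per-root record of outstanding metadata reservations, one slot per
 * type, mirroring qgroup_meta_rsv_prealloc / qgroup_meta_rsv_pertrans. */
enum { TOY_PREALLOC, TOY_PERTRANS, TOY_NR_TYPES };

struct toy_root { uint64_t meta_rsv[TOY_NR_TYPES]; };

static void toy_add(struct toy_root *root, int type, uint64_t bytes)
{
	root->meta_rsv[type] += bytes;
}

/* Clamp to what was actually recorded, so a free that overlaps a quota
 * enable/disable can never underflow the counter; return the amount the
 * caller is really allowed to hand back. */
static uint64_t toy_sub(struct toy_root *root, int type, uint64_t bytes)
{
	if (bytes > root->meta_rsv[type])
		bytes = root->meta_rsv[type];
	root->meta_rsv[type] -= bytes;
	return bytes;
}

int main(void)
{
	struct toy_root root = { { 0 } };

	toy_add(&root, TOY_PREALLOC, 16384);
	/* Caller tries to free more than was ever recorded (e.g. part of the
	 * reservation happened while quota was disabled). */
	uint64_t freed = toy_sub(&root, TOY_PREALLOC, 65536);

	printf("freed %llu, remaining %llu\n",
	       (unsigned long long)freed,
	       (unsigned long long)root.meta_rsv[TOY_PREALLOC]);
	return 0;
}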
+ * Here ensure we will only free what we really have reserved. + */ + num_bytes = sub_root_meta_rsv(root, num_bytes, type); + BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); + trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, + num_bytes, type); +} + +static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, + int num_bytes) +{ + struct btrfs_qgroup *qgroup; + LIST_HEAD(qgroup_list); + + if (num_bytes == 0) + return; + if (!fs_info->quota_root) + return; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + qgroup_rsv_release(fs_info, qgroup, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); + if (!sb_rdonly(fs_info->sb)) + qgroup_rsv_add(fs_info, qgroup, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); + + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); + } +out: + qgroup_iterator_clean(&qgroup_list); + spin_unlock(&fs_info->qgroup_lock); +} + +void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + !is_fstree(root->root_key.objectid)) + return; + /* Same as btrfs_qgroup_free_meta_prealloc() */ + num_bytes = sub_root_meta_rsv(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); + trace_qgroup_meta_convert(root, num_bytes); + qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); +} + +/* + * Check qgroup reserved space leaking, normally at destroy inode + * time + */ +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) +{ + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator iter; + int ret; + + extent_changeset_init(&changeset); + ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, &changeset); + + WARN_ON(ret < 0); + if (WARN_ON(changeset.bytes_changed)) { + ULIST_ITER_INIT(&iter); + while ((unode = ulist_next(&changeset.range_changed, &iter))) { + btrfs_warn(inode->root->fs_info, + "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", + btrfs_ino(inode), unode->val, unode->aux); + } + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); + + } + extent_changeset_release(&changeset); +} + +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks) +{ + int i; + + spin_lock_init(&swapped_blocks->lock); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + swapped_blocks->blocks[i] = RB_ROOT; + swapped_blocks->swapped = false; +} + +/* + * Delete all swapped blocks record of @root. + * Every record here means we skipped a full subtree scan for qgroup. + * + * Gets called when committing one transaction. 
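qgroup_convert_meta() above moves already-reserved bytes from the PREALLOC bucket to the PERTRANS bucket on the qgroup and on every group it is a member of. A simplified sketch of that walk, using a single parent pointer instead of the kernel's list of parent groups (all toy names are invented):

/* Build & run: cc -std=c99 -Wall convert.c && ./a.out */
#include <stdio.h>
#include <stdint.h>

/* Toy qgroup: two reservation buckets plus one parent pointer.
 * (Real qgroups can have several parents; one is enough to show the walk.) */
struct toy_qgroup {
	const char *name;
	uint64_t rsv_prealloc;
	uint64_t rsv_pertrans;
	struct toy_qgroup *parent;
};

/* Move @bytes from PREALLOC to PERTRANS on the qgroup and every ancestor,
 * like the iterator loop in qgroup_convert_meta(). */
static void toy_convert(struct toy_qgroup *qg, uint64_t bytes)
{
	for (; qg; qg = qg->parent) {
		qg->rsv_prealloc -= bytes;
		qg->rsv_pertrans += bytes;
	}
}

int main(void)
{
	struct toy_qgroup top = { "1/100", 4096, 0, NULL };
	struct toy_qgroup sub = { "0/257", 4096, 0, &top };

	toy_convert(&sub, 4096);

	for (struct toy_qgroup *qg = &sub; qg; qg = qg->parent)
		printf("%s: prealloc=%llu pertrans=%llu\n", qg->name,
		       (unsigned long long)qg->rsv_prealloc,
		       (unsigned long long)qg->rsv_pertrans);
	return 0;
}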
+ */ +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) +{ + struct btrfs_qgroup_swapped_blocks *swapped_blocks; + int i; + + swapped_blocks = &root->swapped_blocks; + + spin_lock(&swapped_blocks->lock); + if (!swapped_blocks->swapped) + goto out; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + struct rb_root *cur_root = &swapped_blocks->blocks[i]; + struct btrfs_qgroup_swapped_block *entry; + struct btrfs_qgroup_swapped_block *next; + + rbtree_postorder_for_each_entry_safe(entry, next, cur_root, + node) + kfree(entry); + swapped_blocks->blocks[i] = RB_ROOT; + } + swapped_blocks->swapped = false; +out: + spin_unlock(&swapped_blocks->lock); +} + +/* + * Add subtree roots record into @subvol_root. + * + * @subvol_root: tree root of the subvolume tree get swapped + * @bg: block group under balance + * @subvol_parent/slot: pointer to the subtree root in subvolume tree + * @reloc_parent/slot: pointer to the subtree root in reloc tree + * BOTH POINTERS ARE BEFORE TREE SWAP + * @last_snapshot: last snapshot generation of the subvolume tree + */ +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = subvol_root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct rb_node **cur; + struct rb_node *parent = NULL; + int level = btrfs_header_level(subvol_parent) - 1; + int ret = 0; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", + __func__, + btrfs_node_ptr_generation(subvol_parent, subvol_slot), + btrfs_node_ptr_generation(reloc_parent, reloc_slot)); + return -EUCLEAN; + } + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + ret = -ENOMEM; + goto out; + } + + /* + * @reloc_parent/slot is still before swap, while @block is going to + * record the bytenr after swap, so we do the swap here. + */ + block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); + block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, + reloc_slot); + block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); + block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, + subvol_slot); + block->last_snapshot = last_snapshot; + block->level = level; + + /* + * If we have bg == NULL, we're called from btrfs_recover_relocation(), + * no one else can modify tree blocks thus we qgroup will not change + * no matter the value of trace_leaf. 
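btrfs_qgroup_add_swapped_blocks() above keys its rb-tree on the post-swap subvolume bytenr and treats an equal key with mismatching generation/reloc fields as a "shouldn't happen" corruption case. The sketch below reproduces that three-way decision with a small sorted array standing in for the rb-tree; the toy names and sample bytenrs are illustrative only.

/* Build & run: cc -std=c99 -Wall swapped.c && ./a.out */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Toy swapped-block record: keyed by the subvolume-tree bytenr after the
 * swap, with the matching reloc-tree bytenr/generation carried along. */
struct toy_block {
	uint64_t subvol_bytenr;
	uint64_t subvol_gen;
	uint64_t reloc_bytenr;
	uint64_t reloc_gen;
};

#define TOY_MAX 16

struct toy_blocks {
	struct toy_block blk[TOY_MAX];
	int count;
};

/* Insert keyed on subvol_bytenr (kept sorted, standing in for the rb-tree).
 * Returns 0 on insert or an identical duplicate, -1 on a duplicate key whose
 * other fields disagree -- the case the kernel turns into -EEXIST plus a
 * qgroup-inconsistent mark. (memcmp works here because the struct has no
 * padding; the kernel compares field by field.) */
static int toy_insert(struct toy_blocks *b, const struct toy_block *nb)
{
	int i = 0;

	while (i < b->count && b->blk[i].subvol_bytenr < nb->subvol_bytenr)
		i++;
	if (i < b->count && b->blk[i].subvol_bytenr == nb->subvol_bytenr)
		return memcmp(&b->blk[i], nb, sizeof(*nb)) ? -1 : 0;
	if (b->count == TOY_MAX)
		return -1;
	memmove(&b->blk[i + 1], &b->blk[i], (b->count - i) * sizeof(*nb));
	b->blk[i] = *nb;
	b->count++;
	return 0;
}

int main(void)
{
	struct toy_blocks blocks = { .count = 0 };
	struct toy_block a = { 1048576, 100, 2097152, 100 };
	struct toy_block bad = { 1048576, 101, 2097152, 100 };

	printf("insert a:   %d\n", toy_insert(&blocks, &a));   /* 0 */
	printf("insert a:   %d\n", toy_insert(&blocks, &a));   /* 0, duplicate */
	printf("insert bad: %d\n", toy_insert(&blocks, &bad)); /* -1, mismatch */
	return 0;
}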
+ */ + if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) + block->trace_leaf = true; + else + block->trace_leaf = false; + btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); + + /* Insert @block into @blocks */ + spin_lock(&blocks->lock); + cur = &blocks->blocks[level].rb_node; + while (*cur) { + struct btrfs_qgroup_swapped_block *entry; + + parent = *cur; + entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, + node); + + if (entry->subvol_bytenr < block->subvol_bytenr) { + cur = &(*cur)->rb_left; + } else if (entry->subvol_bytenr > block->subvol_bytenr) { + cur = &(*cur)->rb_right; + } else { + if (entry->subvol_generation != + block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != + block->reloc_generation) { + /* + * Duplicated but mismatch entry found. + * Shouldn't happen. + * + * Marking qgroup inconsistent should be enough + * for end users. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EEXIST; + } + kfree(block); + goto out_unlock; + } + } + rb_link_node(&block->node, parent, cur); + rb_insert_color(&block->node, &blocks->blocks[level]); + blocks->swapped = true; +out_unlock: + spin_unlock(&blocks->lock); +out: + if (ret < 0) + qgroup_mark_inconsistent(fs_info); + return ret; +} + +/* + * Check if the tree block is a subtree root, and if so do the needed + * delayed subtree trace for qgroup. + * + * This is called during btrfs_cow_block(). + */ +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *subvol_eb) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; + struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct extent_buffer *reloc_eb = NULL; + struct rb_node *node; + bool found = false; + bool swapped = false; + int level = btrfs_header_level(subvol_eb); + int ret = 0; + int i; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + return 0; + + spin_lock(&blocks->lock); + if (!blocks->swapped) { + spin_unlock(&blocks->lock); + return 0; + } + node = blocks->blocks[level].rb_node; + + while (node) { + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + if (block->subvol_bytenr < subvol_eb->start) { + node = node->rb_left; + } else if (block->subvol_bytenr > subvol_eb->start) { + node = node->rb_right; + } else { + found = true; + break; + } + } + if (!found) { + spin_unlock(&blocks->lock); + goto out; + } + /* Found one, remove it from @blocks first and update blocks->swapped */ + rb_erase(&block->node, &blocks->blocks[level]); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (RB_EMPTY_ROOT(&blocks->blocks[i])) { + swapped = true; + break; + } + } + blocks->swapped = swapped; + spin_unlock(&blocks->lock); + + check.level = block->level; + check.transid = block->reloc_generation; + check.has_first_key = true; + memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); + + /* Read out reloc subtree root */ + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); + if (IS_ERR(reloc_eb)) { + ret = PTR_ERR(reloc_eb); + reloc_eb = NULL; + goto free_out; + } + if (!extent_buffer_uptodate(reloc_eb)) { + ret = -EIO; + goto free_out; + } + + ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, + block->last_snapshot, block->trace_leaf); +free_out: + kfree(block); + free_extent_buffer(reloc_eb); 
+out: + if (ret < 0) { + btrfs_err_rl(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); + qgroup_mark_inconsistent(fs_info); + } + return ret; +} + +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) +{ + struct btrfs_qgroup_extent_record *entry; + struct btrfs_qgroup_extent_record *next; + struct rb_root *root; + + root = &trans->delayed_refs.dirty_extent_root; + rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + ulist_free(entry->old_roots); + kfree(entry); + } + *root = RB_ROOT; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 0000000000..1203f06320 --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,452 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + */ + +#ifndef BTRFS_QGROUP_H +#define BTRFS_QGROUP_H + +#include +#include +#include +#include "ulist.h" +#include "delayed-ref.h" +#include "misc.h" + +/* + * Btrfs qgroup overview + * + * Btrfs qgroup splits into 3 main part: + * 1) Reserve + * Reserve metadata/data space for incoming operations + * Affect how qgroup limit works + * + * 2) Trace + * Tell btrfs qgroup to trace dirty extents. + * + * Dirty extents including: + * - Newly allocated extents + * - Extents going to be deleted (in this trans) + * - Extents whose owner is going to be modified + * + * This is the main part affects whether qgroup numbers will stay + * consistent. + * Btrfs qgroup can trace clean extents and won't cause any problem, + * but it will consume extra CPU time, it should be avoided if possible. + * + * 3) Account + * Btrfs qgroup will updates its numbers, based on dirty extents traced + * in previous step. + * + * Normally at qgroup rescan and transaction commit time. + */ + +/* + * Special performance optimization for balance. + * + * For balance, we need to swap subtree of subvolume and reloc trees. + * In theory, we need to trace all subtree blocks of both subvolume and reloc + * trees, since their owner has changed during such swap. + * + * However since balance has ensured that both subtrees are containing the + * same contents and have the same tree structures, such swap won't cause + * qgroup number change. + * + * But there is a race window between subtree swap and transaction commit, + * during that window, if we increase/decrease tree level or merge/split tree + * blocks, we still need to trace the original subtrees. + * + * So for balance, we use a delayed subtree tracing, whose workflow is: + * + * 1) Record the subtree root block get swapped. + * + * During subtree swap: + * O = Old tree blocks + * N = New tree blocks + * reloc tree subvolume tree X + * Root Root + * / \ / \ + * NA OB OA OB + * / | | \ / | | \ + * NC ND OE OF OC OD OE OF + * + * In this case, NA and OA are going to be swapped, record (NA, OA) into + * subvolume tree X. + * + * 2) After subtree swap. + * reloc tree subvolume tree X + * Root Root + * / \ / \ + * OA OB NA OB + * / | | \ / | | \ + * OC OD OE OF NC ND OE OF + * + * 3a) COW happens for OB + * If we are going to COW tree block OB, we check OB's bytenr against + * tree X's swapped_blocks structure. + * If it doesn't fit any, nothing will happen. + * + * 3b) COW happens for NA + * Check NA's bytenr against tree X's swapped_blocks, and get a hit. + * Then we do subtree scan on both subtrees OA and NA. + * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND). + * + * Then no matter what we do to subvolume tree X, qgroup numbers will + * still be correct. 
+ * Then NA's record gets removed from X's swapped_blocks. + * + * 4) Transaction commit + * Any record in X's swapped_blocks gets removed, since there is no + * modification to the swapped subtrees, no need to trigger heavy qgroup + * subtree rescan for them. + */ + +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) + +/* + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. + */ +struct btrfs_qgroup_extent_record { + struct rb_node node; + u64 bytenr; + u64 num_bytes; + + /* + * For qgroup reserved data space freeing. + * + * @data_rsv_refroot and @data_rsv will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * transaction commit time. + */ + u32 data_rsv; /* reserved data space needs to be freed */ + u64 data_rsv_refroot; /* which root the reserved data belongs to */ + struct ulist *old_roots; +}; + +struct btrfs_qgroup_swapped_block { + struct rb_node node; + + int level; + bool trace_leaf; + + /* bytenr/generation of the tree block in subvolume tree after swap */ + u64 subvol_bytenr; + u64 subvol_generation; + + /* bytenr/generation of the tree block in reloc tree after swap */ + u64 reloc_bytenr; + u64 reloc_generation; + + u64 last_snapshot; + struct btrfs_key first_key; +}; + +/* + * Qgroup reservation types: + * + * DATA: + * space reserved for data + * + * META_PERTRANS: + * Space reserved for metadata (per-transaction) + * Due to the fact that qgroup data is only updated at transaction commit + * time, reserved space for metadata must be kept until transaction + * commits. + * Any metadata reserved that are used in btrfs_start_transaction() should + * be of this type. + * + * META_PREALLOC: + * There are cases where metadata space is reserved before starting + * transaction, and then btrfs_join_transaction() to get a trans handle. + * Any metadata reserved for such usage should be of this type. + * And after join_transaction() part (or all) of such reservation should + * be converted into META_PERTRANS. + */ +enum btrfs_qgroup_rsv_type { + BTRFS_QGROUP_RSV_DATA, + BTRFS_QGROUP_RSV_META_PERTRANS, + BTRFS_QGROUP_RSV_META_PREALLOC, + BTRFS_QGROUP_RSV_LAST, +}; + +/* + * Represents how many bytes we have reserved for this qgroup. + * + * Each type should have different reservation behavior. + * E.g, data follows its io_tree flag modification, while + * *currently* meta is just reserve-and-clear during transaction. + * + * TODO: Add new type for reservation which can survive transaction commit. + * Current metadata reservation behavior is not suitable for such case. + */ +struct btrfs_qgroup_rsv { + u64 values[BTRFS_QGROUP_RSV_LAST]; +}; + +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + struct btrfs_qgroup_rsv rsv; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + + /* + * For qgroup iteration usage. 
+ * + * The iteration list should always be empty until qgroup_iterator_add() + * is called. And should be reset to empty after the iteration is + * finished. + */ + struct list_head iterator; + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + * Refer to qgroup_shared_accounting() for details. + */ + u64 old_refcnt; + u64 new_refcnt; + + /* + * Sysfs kobjectid + */ + struct kobject kobj; +}; + +static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) +{ + return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); +} + +/* + * For qgroup event trace points only + */ +enum { + ENUM_BIT(QGROUP_RESERVE), + ENUM_BIT(QGROUP_RELEASE), + ENUM_BIT(QGROUP_FREE), +}; + +int btrfs_quota_enable(struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; + +/* + * Inform qgroup to trace one dirty extent, its info is recorded in @record. + * So qgroup can account it at transaction committing time. + * + * No lock version, caller must acquire delayed ref lock and allocated memory, + * then call btrfs_qgroup_trace_extent_post() after exiting lock context. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_trace_extent_nolock( + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); + +/* + * Post handler after qgroup_trace_extent_nolock(). + * + * NOTE: Current qgroup does the expensive backref walk at transaction + * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming + * new transaction. + * This is designed to allow btrfs_find_all_roots() to get correct new_roots + * result. + * + * However for old_roots there is no need to do backref walk at that time, + * since we search commit roots to walk backref and result will always be + * correct. + * + * Due to the nature of no lock version, we can't do backref there. + * So we must call btrfs_qgroup_trace_extent_post() after exiting + * spinlock context. + * + * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result + * using current root, then we can move all expensive backref walk out of + * transaction committing, but not now as qgroup accounting will be wrong again. + */ +int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, + struct btrfs_qgroup_extent_record *qrecord); + +/* + * Inform qgroup to trace one dirty extent, specified by @bytenr and + * @num_bytes. + * So qgroup can account it at commit trans time. + * + * Better encapsulated version, with memory allocation and backref walk for + * commit roots. + * So this can sleep. + * + * Return 0 if the operation is done. 
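btrfs_qgroup_subvolid() above masks the level bits out of a qgroup id. As far as I know the upstream BTRFS_QGROUP_LEVEL_SHIFT is 48, so an id written as "1/100" is (1 << 48) | 100; the sketch below assumes that value (flagged in the TOY_QGROUP_LEVEL_SHIFT constant) and shows the split in both directions.

/* Build & run: cc -std=c99 -Wall qgroupid.c && ./a.out */
#include <stdio.h>
#include <stdint.h>

/* Assumption: matches the upstream BTRFS_QGROUP_LEVEL_SHIFT value. */
#define TOY_QGROUP_LEVEL_SHIFT 48

static uint64_t toy_make_qgroupid(uint64_t level, uint64_t id)
{
	return (level << TOY_QGROUP_LEVEL_SHIFT) | id;
}

static uint64_t toy_qgroup_level(uint64_t qgroupid)
{
	return qgroupid >> TOY_QGROUP_LEVEL_SHIFT;
}

/* Same mask as btrfs_qgroup_subvolid() above. */
static uint64_t toy_qgroup_subvolid(uint64_t qgroupid)
{
	return qgroupid & ((1ULL << TOY_QGROUP_LEVEL_SHIFT) - 1);
}

int main(void)
{
	uint64_t qid = toy_make_qgroupid(0, 257);	/* "0/257" */
	uint64_t high = toy_make_qgroupid(1, 100);	/* "1/100" */

	printf("0x%llx -> %llu/%llu\n", (unsigned long long)qid,
	       (unsigned long long)toy_qgroup_level(qid),
	       (unsigned long long)toy_qgroup_subvolid(qid));
	printf("0x%llx -> %llu/%llu\n", (unsigned long long)high,
	       (unsigned long long)toy_qgroup_level(high),
	       (unsigned long long)toy_qgroup_subvolid(high));
	return 0;
}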
+ * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes); + +/* + * Inform qgroup to trace all leaf items of data + * + * Return 0 for success + * Return <0 for error(ENOMEM) + */ +int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, + struct extent_buffer *eb); +/* + * Inform qgroup to trace a whole subtree, including all its child tree + * blocks and data. + * The root tree block is specified by @root_eb. + * + * Normally used by relocation(tree block swap) and subvolume deletion. + * + * Return 0 for success + * Return <0 for error(ENOMEM or tree search error) + */ +int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, + struct extent_buffer *root_eb, + u64 root_gen, int root_level); +int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes, struct ulist *old_roots, + struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, + u64 objectid, struct btrfs_qgroup_inherit *inherit); +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +/* New io_tree based accurate qgroup reserve API */ +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); +int btrfs_qgroup_free_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len, u64 *freed); +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce); +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush); +/* Reserve metadata space for pertrans and prealloc type */ +static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, + int num_bytes, bool enforce) +{ + return __btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS, + enforce, false); +} +static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, + int num_bytes, bool enforce, + bool noflush) +{ + return __btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, + enforce, noflush); +} + +void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type); + +/* Free per-transaction meta reservation for error handling */ +static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, + int num_bytes) +{ + __btrfs_qgroup_free_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); +} + +/* Pre-allocated meta reservation can be freed at need */ +static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, + int num_bytes) +{ + __btrfs_qgroup_free_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); +} + +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ +void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); + +/* + * Convert @num_bytes of META_PREALLOCATED reservation to 
META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ +void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); + +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); + +/* btrfs_qgroup_swapped_blocks related functions */ +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks); + +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot); +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *eb); +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 0000000000..3e014b9370 --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2782 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "misc.h" +#include "ctree.h" +#include "disk-io.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "file-item.h" +#include "btrfs_inode.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT 1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + +#define RBIO_CACHE_SIZE 1024 + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + spinlock_t lock; +}; + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + +/* + * A bvec like structure to present a sector inside a page. + * + * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. 
+ */ +struct sector_ptr { + struct page *page; + unsigned int pgoff:24; + unsigned int uptodate:8; +}; + +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +static int finish_parity_scrub(struct btrfs_raid_bio *rbio); +static void scrub_rbio_work_locked(struct work_struct *work); + +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + bitmap_free(rbio->error_bitmap); + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); + kfree(rbio); +} + +static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) +{ + INIT_WORK(&rbio->work, work_func); + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); +} + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash_table *x; + struct btrfs_stripe_hash *cur; + struct btrfs_stripe_hash *h; + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; + int i; + + if (info->stripe_hash_table) + return 0; + + /* + * The table is large, starting with order 4 and can go as high as + * order 7 in case lock debugging is turned on. + * + * Try harder to allocate and fallback to vmalloc to lower the chance + * of a failing mount. + */ + table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); + if (!table) + return -ENOMEM; + + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + + h = table->table; + + for (i = 0; i < num_entries; i++) { + cur = h + i; + INIT_LIST_HEAD(&cur->hash_list); + spin_lock_init(&cur->lock); + } + + x = cmpxchg(&info->stripe_hash_table, NULL, table); + kvfree(x); + return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_sectors array into the stripe_pages array. We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_sectors; i++) { + /* Some range not covered by bio (partial write), skip it */ + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk. 
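btrfs_alloc_stripe_hash_table() later in this hunk installs the freshly allocated table with a cmpxchg and frees whatever the exchange returned; in practice it is only called from the single-threaded mount path. The standalone sketch below shows the general install-once idiom with C11 atomics; in the sketch the loser frees its own candidate table, which is the usual shape of the idiom rather than a line-for-line transcription of the kernel helper, and the toy types are invented.

/* Build & run: cc -std=c11 -Wall install_once.c && ./a.out */
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct toy_table { int nbuckets; };

static _Atomic(struct toy_table *) global_table;

/* Install-once: allocate a candidate, try to swing the global pointer from
 * NULL to it; whoever loses the race frees its own candidate and uses the
 * winner's table. */
static struct toy_table *get_table(void)
{
	struct toy_table *cur = atomic_load(&global_table);
	struct toy_table *expected = NULL;

	if (cur)
		return cur;

	struct toy_table *mine = malloc(sizeof(*mine));
	if (!mine)
		return NULL;
	mine->nbuckets = 1 << 11;

	if (atomic_compare_exchange_strong(&global_table, &expected, mine))
		return mine;		/* we installed it */

	free(mine);			/* somebody beat us to it */
	return expected;		/* CAS wrote the current value here */
}

int main(void)
{
	struct toy_table *a = get_table();
	struct toy_table *b = get_table();

	printf("same table: %s, buckets: %d\n", a == b ? "yes" : "no",
	       a ? a->nbuckets : 0);
	free(a);
	return 0;
}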
+ */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); + continue; + } + + ASSERT(rbio->stripe_sectors[i].page); + memcpy_page(rbio->stripe_sectors[i].page, + rbio->stripe_sectors[i].pgoff, + rbio->bio_sectors[i].page, + rbio->bio_sectors[i].pgoff, + rbio->bioc->fs_info->sectorsize); + rbio->stripe_sectors[i].uptodate = 1; + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ + u64 num = rbio->bioc->full_stripe_logical; + + /* + * we shift down quite a bit. We're using byte + * addressing, and most of the lower bits are zeros. + * This tends to upset hash_64, and it consistently + * returns just one or two different values. + * + * shifting off the lower bits fixes things. + */ + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + ASSERT(page_nr < rbio->nr_pages); + + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; + i++) { + if (!rbio->stripe_sectors[i].uptodate) + return false; + } + return true; +} + +/* + * Update the stripe_sectors[] array to use correct page and pgoff + * + * Should be called every time any page pointer in stripes_pages[] got modified. + */ +static void index_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + u32 offset; + int i; + + for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + int page_index = offset >> PAGE_SHIFT; + + ASSERT(page_index < rbio->nr_pages); + rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; + rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + } +} + +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + +/* + * Stealing an rbio means taking all the uptodate pages from the stripe array + * in the source rbio and putting them into the destination rbio. + * + * This will also update the involved stripe_sectors[] which are referring to + * the old pages. 
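index_stripe_sectors() above turns a flat sector index into a (page index, offset-in-page) pair by stepping a byte offset in sectorsize increments. The sketch below runs the same arithmetic in userspace; the 1K sector size is hypothetical and only chosen so several sectors land inside one 4K page.

/* Build & run: cc -std=c99 -Wall index_sectors.c && ./a.out */
#include <stdio.h>

/* Illustrative geometry: 4K pages with a (hypothetical) 1K sector size. */
#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_SIZE  (1u << TOY_PAGE_SHIFT)
#define TOY_SECTORSIZE 1024u

int main(void)
{
	unsigned int nr_sectors = 12;

	/* Same arithmetic as index_stripe_sectors(): a flat byte offset is
	 * split into a page index and an offset inside that page. */
	for (unsigned int i = 0, offset = 0; i < nr_sectors;
	     i++, offset += TOY_SECTORSIZE) {
		unsigned int page_index = offset >> TOY_PAGE_SHIFT;
		unsigned int pgoff = offset & (TOY_PAGE_SIZE - 1);

		printf("sector %2u -> page %u, pgoff %4u\n", i, page_index, pgoff);
	}
	return 0;
}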
+ */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) + continue; + + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); + steal_rbio_page(src, dest, i); + } + index_stripe_sectors(dest); + index_stripe_sectors(src); +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination. The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, + struct btrfs_raid_bio *victim) +{ + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); + bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. 
+ * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + refcount_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + + spin_lock(&table->cache_lock); + __remove_rbio_from_cache(rbio); + spin_unlock(&table->cache_lock); +} + +/* + * remove everything in the cache + */ +static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock(&table->cache_lock); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock(&table->cache_lock); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ + if (!info->stripe_hash_table) + return; + btrfs_clear_rbio_cache(info); + kvfree(info->stripe_hash_table); + info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + + spin_lock(&table->cache_lock); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + refcount_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock(&table->cache_lock); +} + +/* + * helper function to run the xor_blocks api. It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. + */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ + int src_off = 0; + int xor_src_cnt = 0; + void *dest = pages[src_cnt]; + + while(src_cnt > 0) { + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + xor_blocks(xor_src_cnt, len, dest, pages + src_off); + + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + } +} + +/* + * Returns true if the bio list inside this rbio covers an entire stripe (no + * rmw required). 
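run_xor() above is the P-parity side of the picture: P is the XOR of all data stripes, and any single missing data stripe is the XOR of the survivors plus P. A self-contained demonstration of both directions (the real helper hands up to MAX_XOR_BLOCKS sources to xor_blocks() per call; the byte patterns and stripe length here are arbitrary):

/* Build & run: cc -std=c99 -Wall xor_parity.c && ./a.out */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define STRIPE 16	/* toy stripe length in bytes */

/* XOR @src into @dest, the core operation xor_blocks() performs. */
static void xor_into(uint8_t *dest, const uint8_t *src, size_t len)
{
	for (size_t i = 0; i < len; i++)
		dest[i] ^= src[i];
}

int main(void)
{
	uint8_t d0[STRIPE], d1[STRIPE], d2[STRIPE], p[STRIPE], rebuilt[STRIPE];

	memset(d0, 0x11, STRIPE);
	memset(d1, 0x22, STRIPE);
	memset(d2, 0x33, STRIPE);

	/* Parity P = D0 ^ D1 ^ D2 (what the RMW path writes to the P stripe). */
	memset(p, 0, STRIPE);
	xor_into(p, d0, STRIPE);
	xor_into(p, d1, STRIPE);
	xor_into(p, d2, STRIPE);

	/* Pretend D1 was lost: rebuild it from the surviving data plus P. */
	memset(rebuilt, 0, STRIPE);
	xor_into(rebuilt, d0, STRIPE);
	xor_into(rebuilt, d2, STRIPE);
	xor_into(rebuilt, p, STRIPE);

	printf("rebuilt matches D1: %s\n",
	       memcmp(rebuilt, d1, STRIPE) == 0 ? "yes" : "no");
	return 0;
}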
+ */ +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + + spin_lock(&rbio->bio_list_lock); + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) + ret = 0; + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); + spin_unlock(&rbio->bio_list_lock); + + return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, + struct btrfs_raid_bio *cur) +{ + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) + return 0; + + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbios though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + + if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) + return 0; + + /* we can't merge with different operations */ + if (last->operation != cur->operation) + return 0; + /* + * We've need read the full stripe from the drive. + * check and repair the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + if (last->operation == BTRFS_RBIO_PARITY_SCRUB) + return 0; + + if (last->operation == BTRFS_RBIO_READ_REBUILD) + return 0; + + return 1; +} + +static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) +{ + ASSERT(stripe_nr < rbio->real_stripes); + ASSERT(sector_nr < rbio->stripe_nsectors); + + return stripe_nr * rbio->stripe_nsectors + sector_nr; +} + +/* Return a sector from rbio->stripe_sectors, not from the bio list */ +static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) +{ + return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, + sector_nr)]; +} + +/* Grab a sector inside P stripe */ +static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) +{ + return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); +} + +/* Grab a sector inside Q stripe, return NULL if not RAID6 */ +static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) +{ + if (rbio->nr_data + 1 == rbio->real_stripes) + return NULL; + return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); +} + +/* + * The first stripe in the table for a logical address + * has the lock. rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet. The rbio is given + * the lock and 0 is returned. The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner. The rbio is freed and the IO will + * start automatically along with the existing rbio. 1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list. 
When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. + * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission. If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash *h; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + + spin_lock(&h->lock); + list_for_each_entry(cur, &h->hash_list, hash_list) { + if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) + continue; + + spin_lock(&cur->bio_list_lock); + + /* Can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + refcount_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + + /* Can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * We couldn't merge with the running rbio, see if we can merge + * with the pending ones. We don't have to check for rmw_locked + * because there is no way they are inside finish_rmw right now + */ + list_for_each_entry(pending, &cur->plug_list, plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* + * No merging, put us on the tail of the plug list, our rbio + * will be started with the currently running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; + } +lockit: + refcount_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); +out: + spin_unlock(&h->lock); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) + free_raid_bio(freeit); + return ret; +} + +static void recover_rbio_work_locked(struct work_struct *work); + +/* + * called as rmw or parity rebuild is completed. If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ + int bucket; + struct btrfs_stripe_hash *h; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); + h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; + + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + + spin_lock(&h->lock); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } + + list_del_init(&rbio->hash_list); + refcount_dec(&rbio->refs); + + /* + * we use the plug list to hold all the rbios + * waiting for the chance to lock this stripe. + * hand the lock over to one of them. 
+ */ + if (!list_empty(&rbio->plug_list)) { + struct btrfs_raid_bio *next; + struct list_head *head = rbio->plug_list.next; + + next = list_entry(head, struct btrfs_raid_bio, + plug_list); + + list_del_init(&rbio->plug_list); + + list_add(&next->hash_list, &h->hash_list); + refcount_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (next->operation == BTRFS_RBIO_READ_REBUILD) { + start_async_work(next, recover_rbio_work_locked); + } else if (next->operation == BTRFS_RBIO_WRITE) { + steal_rbio(rbio, next); + start_async_work(next, rmw_rbio_work_locked); + } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { + steal_rbio(rbio, next); + start_async_work(next, scrub_rbio_work_locked); + } + + goto done_nolock; + } + } +done: + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + +done_nolock: + if (!keep_cache) + remove_rbio_from_cache(rbio); +} + +static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +{ + struct bio *next; + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + cur->bi_status = err; + bio_endio(cur); + cur = next; + } +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +{ + struct bio *cur = bio_list_get(&rbio->bio_list); + struct bio *extra; + + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; + + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); + + /* + * At this moment, rbio->bio_list is empty, however since rbio does not + * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the + * hash list, rbio may be merged with others so that rbio->bio_list + * becomes non-empty. + * Once unlock_stripe() is done, rbio->bio_list will not be updated any + * more and we can call bio_endio() on all queued bios. + */ + unlock_stripe(rbio); + extra = bio_list_get(&rbio->bio_list); + free_raid_bio(rbio); + + rbio_endio_bio_list(cur, err); + if (extra) + rbio_endio_bio_list(extra, err); +} + +/* + * Get a sector pointer specified by its @stripe_nr and @sector_nr. + * + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. + * + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. 
+ */ +static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) +{ + struct sector_ptr *sector; + int index; + + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); + + spin_lock(&rbio->bio_list_lock); + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; + spin_unlock(&rbio->bio_list_lock); + return sector; + } + spin_unlock(&rbio->bio_list_lock); + + return &rbio->stripe_sectors[index]; +} + +/* + * allocation and initial setup for the btrfs_raid_bio. Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, + struct btrfs_io_context *bioc) +{ + const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + const unsigned int num_sectors = stripe_nsectors * real_stripes; + struct btrfs_raid_bio *rbio; + + /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); + + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); + if (!rbio) + return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || + !rbio->finish_pointers || !rbio->error_bitmap) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } + + bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); + INIT_LIST_HEAD(&rbio->plug_list); + spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); + INIT_LIST_HEAD(&rbio->hash_list); + btrfs_get_bioc(bioc); + rbio->bioc = bioc; + rbio->nr_pages = num_pages; + rbio->nr_sectors = num_sectors; + rbio->real_stripes = real_stripes; + rbio->stripe_npages = stripe_npages; + rbio->stripe_nsectors = stripe_nsectors; + refcount_set(&rbio->refs, 1); + atomic_set(&rbio->stripes_pending, 0); + + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + + return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + /* Mapping all sectors */ + index_stripe_sectors(rbio); + return 0; +} + +/* only allocate pages for p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; 
+ + ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, + rbio->stripe_pages + data_pages); + if (ret < 0) + return ret; + + index_stripe_sectors(rbio); + return 0; +} + +/* + * Return the total number of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. + */ +static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + } + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + } + return found_errors; +} + +/* + * Add a single sector @sector into our list of bios for IO. + * + * Return 0 if everything went well. + * Return <0 for error. + */ +static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct sector_ptr *sector, + unsigned int stripe_nr, + unsigned int sector_nr, + enum req_op op) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio *last = bio_list->tail; + int ret; + struct bio *bio; + struct btrfs_io_stripe *stripe; + u64 disk_start; + + /* + * Note: here stripe_nr has taken device replace into consideration, + * thus it can be larger than rbio->real_stripe. + * So here we check against bioc->num_stripes, not rbio->real_stripes. + */ + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT(sector->page); + + stripe = &rbio->bioc->stripes[stripe_nr]; + disk_start = stripe->physical + sector_nr * sectorsize; + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) { + int found_errors; + + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + + /* Check if we have reached tolerance early. 
*/ + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + NULL, NULL); + if (found_errors > rbio->bioc->max_errors) + return -EIO; + return 0; + } + + /* see if we can add this page onto our existing bio */ + if (last) { + u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; + last_end += last->bi_iter.bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && !last->bi_status && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, sector->page, sectorsize, + sector->pgoff); + if (ret == sectorsize) + return 0; + } + } + + /* put a new bio on the list */ + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), + op, GFP_NOFS); + bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + bio->bi_private = rbio; + + __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + bio_list_add(bio_list, bio); + return 0; +} + +static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->full_stripe_logical; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; + + for (bvec_offset = 0; bvec_offset < bvec.bv_len; + bvec_offset += sectorsize, offset += sectorsize) { + int index = offset / sectorsize; + struct sector_ptr *sector = &rbio->bio_sectors[index]; + + sector->page = bvec.bv_page; + sector->pgoff = bvec.bv_offset + bvec_offset; + ASSERT(sector->pgoff < PAGE_SIZE); + } + } +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + + spin_lock(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); + + spin_unlock(&rbio->bio_list_lock); +} + +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; + + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; + } + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; +} + +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +/* Generate PQ for one vertical stripe. 
*/ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + void **pointers = rbio->finish_pointers; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct sector_ptr *sector; + int stripe; + const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; + + /* First collect one sector from each data stripe */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; + } + + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + + if (has_qstripe) { + /* + * RAID6, add the qstripe and call the library function + * to fill in our p/q + */ + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; + + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, + pointers); + } else { + /* raid5 */ + memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + } + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); +} + +static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + /* The total sector number inside the full stripe. */ + int total_sector_nr; + int sectornr; + int stripe; + int ret; + + ASSERT(bio_list_size(bio_list) == 0); + + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + + /* + * Reset errors, as we may have errors inherited from from degraded + * write. + */ + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + /* + * Start assembly. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); + } + + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; + } + + if (likely(!rbio->bioc->replace_nr_stripes)) + return 0; + + /* + * Make a copy for the replace target device. + * + * Thus the source stripe number (in replace_stripe_src) should be valid. + */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); + + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + /* + * For RAID56, there is only one device that can be replaced, + * and replace_stripe_src[0] indicates the stripe number we + * need to copy from. + */ + if (stripe != rbio->bioc->replace_stripe_src) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } + + /* This vertical stripe has no data, skip it. 
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); + } + + ret = rbio_add_io_sector(rbio, bio_list, sector, + rbio->real_stripes, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; + } + + return 0; +error: + bio_list_put(bio_list); + return -EIO; +} + +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->full_stripe_logical; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); + + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); + + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. + */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); + } +} + +/* + * For subpage case, we can no longer set page Up-to-date directly for + * stripe_pages[], thus we need to locate the sector. + */ +static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, + struct page *page, + unsigned int pgoff) +{ + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + if (sector->page == page && sector->pgoff == pgoff) + return sector; + } + return NULL; +} + +/* + * this sets each page in the bio uptodate. 
It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct sector_ptr *sector; + int pgoff; + + for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; + pgoff += sectorsize) { + sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + ASSERT(sector); + if (sector) + sector->uptodate = 1; + } + } +} + +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; + + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; +} + +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + int i; + + bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. + */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); +} + +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; + + /* P/Q stripes, they have no data csum to verify against. */ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) + return; + + bio_for_each_segment_all(bvec, bio, iter_all) { + int bv_offset; + + for (bv_offset = bvec->bv_offset; + bv_offset < bvec->bv_offset + bvec->bv_len; + bv_offset += fs_info->sectorsize, total_sector_nr++) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + + total_sector_nr * fs_info->csum_size; + int ret; + + /* No csum for this sector, skip to the next sector. 
*/ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; + + ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, + bv_offset, csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + } + } +} + +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (bio->bi_status) { + rbio_update_error_bitmap(rbio, bio); + } else { + set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); + } + + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; + + if (trace_raid56_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); + } + submit_bio(bio); + } + + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); +} + +static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) +{ + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; + + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + + index_stripe_sectors(rbio); + return 0; +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list. When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { + struct blk_plug_cb cb; + struct btrfs_fs_info *info; + struct list_head rbio_list; + struct work_struct work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; + + if (a_sector < b_sector) + return -1; + if (a_sector > b_sector) + return 1; + return 0; +} + +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *last = NULL; + + list_sort(NULL, &plug->rbio_list, plug_cmp); + + while (!list_empty(&plug->rbio_list)) { + cur = list_entry(plug->rbio_list.next, + struct btrfs_raid_bio, plug_list); + list_del_init(&cur->plug_list); + + if (rbio_is_full(cur)) { + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); + continue; + } + if (last) { + if (rbio_can_merge(last, cur)) { + merge_rbio(last, cur); + free_raid_bio(cur); + continue; + } + start_async_work(last, rmw_rbio_work); + } + last = cur; + } + if (last) + start_async_work(last, rmw_rbio_work); + kfree(plug); +} + +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
*/ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->full_stripe_logical; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + +/* + * our main entry point for writes from the rest of the FS. + */ +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) +{ + struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + rbio = alloc_rbio(fs_info, bioc); + if (IS_ERR(rbio)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; + } + rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); + + /* + * Don't plug on full rbios, just get them out the door + * as quickly as we can + */ + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; + } + } + + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); +} + +static int verify_one_sector(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *csum_expected; + int ret; + + if (!rbio->csum_bitmap || !rbio->csum_buf) + return 0; + + /* No way to verify P/Q as they are not covered by data csum. */ + if (stripe_nr >= rbio->nr_data) + return 0; + /* + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. + */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + } + + ASSERT(sector->page); + + csum_expected = rbio->csum_buf + + (stripe_nr * rbio->stripe_nsectors + sector_nr) * + fs_info->csum_size; + ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, + csum_buf, csum_expected); + return ret; +} + +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. 
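+ *
+ * Return 0 if the vertical stripe has no errors or was repaired, or a
+ * negative errno if it has more failed sectors than the profile tolerates
+ * or a repaired sector still fails its data checksum.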
+ */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + const u32 sectorsize = fs_info->sectorsize; + int found_errors; + int faila; + int failb; + int stripe_nr; + int ret = 0; + + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) + return 0; + + if (found_errors > rbio->bioc->max_errors) + return -EIO; + + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + /* + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. + */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + } + ASSERT(sector->page); + pointers[stripe_nr] = kmap_local_page(sector->page) + + sector->pgoff; + unmap_array[stripe_nr] = pointers[stripe_nr]; + } + + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. + */ + goto cleanup; + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (failb == rbio->real_stripes - 1) { + if (faila == rbio->real_stripes - 2) + /* + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. + */ + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } + + if (failb == rbio->real_stripes - 2) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); + } else { + raid6_2data_recov(rbio->real_stripes, sectorsize, + faila, failb, pointers); + } + } else { + void *p; + + /* Rebuild from P stripe here (raid5 or raid6). */ + ASSERT(failb == -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + + /* Rearrange the pointer array */ + p = pointers[faila]; + for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; + stripe_nr++) + pointers[stripe_nr] = pointers[stripe_nr + 1]; + pointers[rbio->nr_data - 1] = p; + + /* Xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, sectorsize); + + } + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired in the vertical stripe, thus they are now + * uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. 
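+	 * (The uptodate flags for the repaired stripes @faila/@failb are set
+	 * right below.)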
+ * + * If possible, also check if the repaired sector matches its data + * checksum. + */ + if (faila >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + + sector = rbio_stripe_sector(rbio, faila, sector_nr); + sector->uptodate = 1; + } + if (failb >= 0) { + ret = verify_one_sector(rbio, failb, sector_nr); + if (ret < 0) + goto cleanup; + + sector = rbio_stripe_sector(rbio, failb, sector_nr); + sector->uptodate = 1; + } + +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); + return ret; +} + +static int recover_sectors(struct btrfs_raid_bio *rbio) +{ + void **pointers = NULL; + void **unmap_array = NULL; + int sectornr; + int ret = 0; + + /* + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } + + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; + } + +out: + kfree(pointers); + kfree(unmap_array); + return ret; +} + +static void recover_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + + /* + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + + /* + * Skip the range which has error. It can be a range which is + * marked error (for csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { + /* + * Also set the error bit for missing device, which + * may not yet have its error bit set. 
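+			 * (The caller typically only marked the range that
+			 * failed, so a missing device elsewhere in the full
+			 * stripe is recorded here.)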
+ */ + set_bit(total_sector_nr, rbio->error_bitmap); + continue; + } + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } + } + + submit_read_wait_bio_list(rbio, &bio_list); + ret = recover_sectors(rbio); +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); +} + +static void recover_rbio_work_locked(struct work_struct *work) +{ + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); +} + +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; + + /* + * This is for RAID6 extra recovery tries, thus mirror number should + * be large than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods. + */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; + + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should found at least one vertical stripe with error.*/ + ASSERT(found); +} + +/* + * the main entry point for reads from the higher layers. This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_raid_bio *rbio; + + rbio = alloc_rbio(fs_info, bioc); + if (IS_ERR(rbio)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; + } + + rbio->operation = BTRFS_RBIO_READ_REBUILD; + rbio_add_bio(rbio, bio); + + set_rbio_range_error(rbio, bio); + + /* + * Loop retry: + * for 'mirror == 2', reconstruct from all other stripes. + * for 'mirror_num > 2', select a stripe to fail on every retry. + */ + if (mirror_num > 2) + set_rbio_raid6_extra_error(rbio, mirror_num); + + start_async_work(rbio, recover_rbio_work); +} + +static void fill_data_csums(struct btrfs_raid_bio *rbio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, + rbio->bioc->full_stripe_logical); + const u64 start = rbio->bioc->full_stripe_logical; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; + + /* The rbio should not have its csum buffer initialized. */ + ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); + + /* + * Skip the csum search if: + * + * - The rbio doesn't belong to data block groups + * Then we are doing IO for tree blocks, no need to search csums. 
+ * + * - The rbio belongs to mixed block groups + * This is to avoid deadlock, as we're already holding the full + * stripe lock, if we trigger a metadata read, and it needs to do + * raid56 recovery, we will deadlock. + */ + if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || + rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) + return; + + rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * + fs_info->csum_size, GFP_NOFS); + rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, + GFP_NOFS); + if (!rbio->csum_buf || !rbio->csum_bitmap) { + ret = -ENOMEM; + goto error; + } + + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) + goto no_csum; + return; + +error: + /* + * We failed to allocate memory or grab the csum, but it's not fatal, + * we can still continue. But better to warn users that RMW is no + * longer safe for this particular sub-stripe write. + */ + btrfs_warn_rl(fs_info, +"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", + rbio->bioc->full_stripe_logical, ret); +no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; +} + +static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + + /* + * Fill the data csums we need for data verification. We need to fill + * the csum_bitmap/csum_buf first, as our endio function will try to + * verify the data sectors. + */ + fill_data_csums(rbio); + + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch), just let recover_sectors() to handle them all. + */ + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + blk_status_t err = bio->bi_status; + + if (err) + rbio_update_error_bitmap(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +/* + * To determine if we need to read any sector from the disk. + * Should only be utilized in RMW path, to skip cached rbio. 
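+ *
+ * Return true if any data sector has no page or is not uptodate, in which
+ * case a sub-stripe write must first read the full stripe (RMW).  Return
+ * false if every data sector is already cached, so we can generate P/Q
+ * right away.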
+ */ +static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + int i; + + for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + /* + * We have a sector which doesn't have page nor uptodate, + * thus this rbio can not be cached one, as cached one must + * have all its data sectors present and uptodate. + */ + if (!sector->page || !sector->uptodate) + return true; + } + return false; +} + +static void rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; + + /* + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. + */ + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + goto out; + + /* + * Either full stripe write, or we have every data sector already + * cached, can go to write path immediately. + */ + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } + + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock(&rbio->bio_list_lock); + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + /* We should have at least one bio assembled. */ + ASSERT(bio_list_size(&bio_list)); + submit_write_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have more errors than our tolerance during the read. */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + +static void rmw_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); +} + +static void rmw_rbio_work_locked(struct work_struct *work) +{ + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); +} + +/* + * The following code is used to scrub/replace the parity stripe + * + * Caller must have already increased bio_counter for getting @bioc. + * + * Note: We need make sure all the pages that add into the scrub/replace + * raid bio are correct and not be changed during the scrub/replace. That + * is those pages just hold metadata or file data with checksum. 
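+ *
+ * The usual flow is raid56_parity_alloc_scrub_rbio() followed by
+ * raid56_parity_submit_scrub_rbio(), with raid56_parity_cache_data_pages()
+ * optionally pre-filling data stripes that scrub has already read and
+ * verified.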
+ */ + +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) +{ + struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_raid_bio *rbio; + int i; + + rbio = alloc_rbio(fs_info, bioc); + if (IS_ERR(rbio)) + return NULL; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio is similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + rbio->operation = BTRFS_RBIO_PARITY_SCRUB; + + /* + * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted + * to the end position, so this search can start from the first parity + * stripe. + */ + for (i = rbio->nr_data; i < rbio->real_stripes; i++) { + if (bioc->stripes[i].dev == scrub_dev) { + rbio->scrubp = i; + break; + } + } + ASSERT(i < rbio->real_stripes); + + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); + return rbio; +} + +/* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. + */ +static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int total_sector_nr; + + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; + + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; + } + index_stripe_sectors(rbio); + return 0; +} + +static int finish_parity_scrub(struct btrfs_raid_bio *rbio) +{ + struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; + void **pointers = rbio->finish_pointers; + unsigned long *pbitmap = &rbio->finish_pbitmap; + int nr_data = rbio->nr_data; + int stripe; + int sectornr; + bool has_qstripe; + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; + struct bio_list bio_list; + int is_replace = 0; + int ret; + + bio_list_init(&bio_list); + + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else + BUG(); + + /* + * Replace is running and our P/Q stripe is being replaced, then we + * need to duplicate the final write to replace target. + */ + if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { + is_replace = 1; + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); + } + + /* + * Because the higher layers(scrubber) are unlikely to + * use this area of the disk again soon, so don't cache + * it. 
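+	 * (Clearing RBIO_CACHE_READY_BIT below keeps this rbio out of the
+	 * stripe cache when it is freed.)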
+ */ + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + p_sector.page = alloc_page(GFP_NOFS); + if (!p_sector.page) + return -ENOMEM; + p_sector.pgoff = 0; + p_sector.uptodate = 1; + + if (has_qstripe) { + /* RAID6, allocate and map temp space for the Q stripe */ + q_sector.page = alloc_page(GFP_NOFS); + if (!q_sector.page) { + __free_page(p_sector.page); + p_sector.page = NULL; + return -ENOMEM; + } + q_sector.pgoff = 0; + q_sector.uptodate = 1; + pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); + } + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + /* Map the parity stripe just once */ + pointers[nr_data] = kmap_local_page(p_sector.page); + + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; + void *parity; + + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; + } + + if (has_qstripe) { + /* RAID6, call the library function to fill in our P/Q */ + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); + } + + /* Check scrubbing parity and repair it */ + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + parity = kmap_local_page(sector->page) + sector->pgoff; + if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) + memcpy(parity, pointers[rbio->scrubp], sectorsize); + else + /* Parity is right, needn't writeback */ + bitmap_clear(&rbio->dbitmap, sectornr, 1); + kunmap_local(parity); + + for (stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); + } + + kunmap_local(pointers[nr_data]); + __free_page(p_sector.page); + p_sector.page = NULL; + if (q_sector.page) { + kunmap_local(pointers[rbio->real_stripes - 1]); + __free_page(q_sector.page); + q_sector.page = NULL; + } + + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; + + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; + } + + if (!is_replace) + goto submit_write; + + /* + * Replace is running and our parity stripe needs to be duplicated to + * the target device. Check we have a valid source stripe number. + */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; + + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->real_stripes, + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; + } + +submit_write: + submit_write_bios(rbio, &bio_list); + return 0; + +cleanup: + bio_list_put(&bio_list); + return ret; +} + +static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) +{ + if (stripe >= 0 && stripe < rbio->nr_data) + return 1; + return 0; +} + +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) +{ + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; + int ret = 0; + + /* + * @pointers array stores the pointer for each sector. 
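+	 * (one slot per stripe; recover_vertical() maps the stripe sectors
+	 * into it for each vertical stripe it repairs)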
+ * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } + + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; + + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); + + if (is_data_stripe(rbio, faila)) + dfail++; + else if (is_parity_stripe(faila)) + failp = faila; + + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; + /* + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) + */ + if (dfail > rbio->bioc->max_errors - 1) { + ret = -EIO; + goto out; + } + /* + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. + */ + if (dfail == 0) + continue; + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) { + ret = -EIO; + goto out; + } + + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; + } +out: + kfree(pointers); + kfree(unmap_array); + return ret; +} + +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; + + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } + + submit_read_wait_bio_list(rbio, &bio_list); + return 0; +} + +static void scrub_rbio(struct btrfs_raid_bio *rbio) +{ + int sector_nr; + int ret; + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto out; + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + ret = scrub_assemble_read_bios(rbio); + if (ret < 0) + goto out; + + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto out; + + /* + * We have every sector properly prepared. 
Can finish the scrub + * and writeback the good content. + */ + ret = finish_parity_scrub(rbio); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + +static void scrub_rbio_work_locked(struct work_struct *work) +{ + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); +} + +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + start_async_work(rbio, scrub_rbio_work_locked); +} + +/* + * This is for scrub call sites where we already have correct data contents. + * This allows us to avoid reading data stripes again. + * + * Unfortunately here we have to do page copy, other than reusing the pages. + * This is due to the fact rbio has its own page management for its cache. + */ +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical) +{ + const u64 offset_in_full_stripe = data_logical - + rbio->bioc->full_stripe_logical; + const int page_index = offset_in_full_stripe >> PAGE_SHIFT; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int ret; + + /* + * If we hit ENOMEM temporarily, but later at + * raid56_parity_submit_scrub_rbio() time it succeeded, we just do + * the extra read, not a big deal. + * + * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, + * the bio would got proper error number set. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return; + + /* data_logical must be at stripe boundary and inside the full stripe. */ + ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); + ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); + + for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { + struct page *dst = rbio->stripe_pages[page_nr + page_index]; + struct page *src = data_pages[page_nr]; + + memcpy_page(dst, 0, src, 0, PAGE_SIZE); + for (int sector_nr = sectors_per_page * page_index; + sector_nr < sectors_per_page * (page_index + 1); + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; + } +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 0000000000..45e6ff7831 --- /dev/null +++ b/fs/btrfs/raid56.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + */ + +#ifndef BTRFS_RAID56_H +#define BTRFS_RAID56_H + +#include +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. 
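+	 * The table is fs_info->stripe_hash_table, hashed by the logical
+	 * address of the full stripe (see lock_stripe_add()).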
+ */ + struct list_head hash_list; + + /* LRU list for the stripe cache */ + struct list_head stripe_cache; + + /* For scheduling work in the helper threads */ + struct work_struct work; + + /* + * bio_list and bio_list_lock are used to add more bios into the stripe + * in hopes of avoiding the full RMW + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* + * Also protected by the bio_list_lock, the plug list is used by the + * plugging code to collect partial bios while plugged. The stripe + * locking code also uses it to hand off the stripe lock to the next + * pending IO. + */ + struct list_head plug_list; + + /* Flags that tell us if it is safe to merge with this bio. */ + unsigned long flags; + + /* + * Set if we're doing a parity rebuild for a read from higher up, which + * is handled differently from a parity rebuild as part of RMW. + */ + enum btrfs_rbio_ops operation; + + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; + + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Number of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* Stripe number that we're scrubbing */ + u8 scrubp; + + /* + * Size of all the bios in the bio_list. This helps us decide if the + * rbio maps to a full stripe or not. + */ + int bio_list_bytes; + + refcount_t refs; + + atomic_t stripes_pending; + + wait_queue_head_t io_wait; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + + /* + * These are two arrays of pointers. We allocate the rbio big enough + * to hold them both and setup their locations when the rbio is + * allocated. + */ + + /* + * Pointers to pages that we allocated for reading/writing stripes + * directly from the disk (including P/Q). + */ + struct page **stripe_pages; + + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; + + /* + * For subpage support, we need to map each sector to above + * stripe_pages. + */ + struct sector_ptr *stripe_sectors; + + /* Allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; + + /* + * The bitmap recording where IO errors happened. + * Each bit is corresponding to one sector in either bio_sectors[] or + * stripe_sectors[] array. + * + * The reason we don't use another bit in sector_ptr is, we have two + * arrays of sectors, and a lot of IO can use sectors in both arrays. + * Thus making it much harder to iterate. + */ + unsigned long *error_bitmap; + + /* + * Checksum buffer if the rbio is for data. The buffer should cover + * all data sectors (excluding P/Q sectors). + */ + u8 *csum_buf; + + /* + * Each bit represents if the corresponding sector has data csum found. + * Should only cover data sectors (excluding P/Q sectors). + */ + unsigned long *csum_bitmap; +}; + +/* + * For trace event usage only. Records useful debug info for each bio submitted + * by RAID56 to each physical device. + * + * No matter signed or not, (-1) is always the one indicating we can not grab + * the proper stripe number. + */ +struct raid56_bio_trace_info { + u64 devid; + + /* The offset inside the stripe. (<= STRIPE_LEN) */ + u32 offset; + + /* + * Stripe number. 
+ * 0 is the first data stripe, and nr_data for P stripe, + * nr_data + 1 for Q stripe. + * >= real_stripes for + */ + u8 stripe_nr; +}; + +static inline int nr_data_stripes(const struct map_lookup *map) +{ + return map->num_stripes - btrfs_nr_parity_stripes(map->type); +} + +static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc) +{ + return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type); +} + +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ + ((x) == RAID6_Q_STRIPE)) + +struct btrfs_device; + +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); + +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); + +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); + +#endif diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h new file mode 100644 index 0000000000..5c2b66d155 --- /dev/null +++ b/fs/btrfs/rcu-string.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + */ + +#ifndef BTRFS_RCU_STRING_H +#define BTRFS_RCU_STRING_H + +struct rcu_string { + struct rcu_head rcu; + char str[]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ + size_t len = strlen(src) + 1; + struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + + (len * sizeof(char)), mask); + if (!ret) + return ret; + /* Warn if the source got unexpectedly truncated. */ + if (WARN_ON(strscpy(ret->str, src, len) < 0)) { + kfree(ret); + return NULL; + } + return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ + if (str) + kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk_ratelimited(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define rcu_str_deref(rcu_str) ({ \ + struct rcu_string *__str = rcu_dereference(rcu_str); \ + __str->str; \ +}) + +#endif diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c new file mode 100644 index 0000000000..1ea5bfb887 --- /dev/null +++ b/fs/btrfs/ref-verify.c @@ -0,0 +1,1028 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2014 Facebook. All rights reserved. + */ + +#include +#include +#include "messages.h" +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "delayed-ref.h" +#include "ref-verify.h" +#include "fs.h" +#include "accessors.h" + +/* + * Used to keep track the roots and number of refs each root has for a given + * bytenr. This just tracks the number of direct references, no shared + * references. + */ +struct root_entry { + u64 root_objectid; + u64 num_refs; + struct rb_node node; +}; + +/* + * These are meant to represent what should exist in the extent tree, these can + * be used to verify the extent tree is consistent as these should all match + * what the extent tree says. 
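+ *
+ * Each entry describes one backref (root_objectid, parent, owner, offset)
+ * together with the number of references it accounts for.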
+ */ +struct ref_entry { + u64 root_objectid; + u64 parent; + u64 owner; + u64 offset; + u64 num_refs; + struct rb_node node; +}; + +#define MAX_TRACE 16 + +/* + * Whenever we add/remove a reference we record the action. The action maps + * back to the delayed ref action. We hold the ref we are changing in the + * action so we can account for the history properly, and we record the root we + * were called with since it could be different from ref_root. We also store + * stack traces because that's how I roll. + */ +struct ref_action { + int action; + u64 root; + struct ref_entry ref; + struct list_head list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; +}; + +/* + * One of these for every block we reference, it holds the roots and references + * to it as well as all of the ref actions that have occurred to it. We never + * free it until we unmount the file system in order to make sure re-allocations + * are happening properly. + */ +struct block_entry { + u64 bytenr; + u64 len; + u64 num_refs; + int metadata; + int from_disk; + struct rb_root roots; + struct rb_root refs; + struct rb_node node; + struct list_head actions; +}; + +static struct block_entry *insert_block_entry(struct rb_root *root, + struct block_entry *be) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct block_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct block_entry, node); + if (entry->bytenr > be->bytenr) + p = &(*p)->rb_left; + else if (entry->bytenr < be->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&be->node, parent_node, p); + rb_insert_color(&be->node, root); + return NULL; +} + +static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n; + struct block_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < bytenr) + n = n->rb_right; + else if (entry->bytenr > bytenr) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +static struct root_entry *insert_root_entry(struct rb_root *root, + struct root_entry *re) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct root_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct root_entry, node); + if (entry->root_objectid > re->root_objectid) + p = &(*p)->rb_left; + else if (entry->root_objectid < re->root_objectid) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&re->node, parent_node, p); + rb_insert_color(&re->node, root); + return NULL; + +} + +static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) +{ + if (ref1->root_objectid < ref2->root_objectid) + return -1; + if (ref1->root_objectid > ref2->root_objectid) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + if (ref1->owner < ref2->owner) + return -1; + if (ref1->owner > ref2->owner) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + return 0; +} + +static struct ref_entry *insert_ref_entry(struct rb_root *root, + struct ref_entry *ref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct ref_entry *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct ref_entry, node); + cmp = comp_refs(entry, ref); + if (cmp > 0) + p = &(*p)->rb_left; + else if (cmp < 0) + p = 
&(*p)->rb_right; + else + return entry; + } + + rb_link_node(&ref->node, parent_node, p); + rb_insert_color(&ref->node, root); + return NULL; + +} + +static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) +{ + struct rb_node *n; + struct root_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct root_entry, node); + if (entry->root_objectid < objectid) + n = n->rb_right; + else if (entry->root_objectid > objectid) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +#ifdef CONFIG_STACKTRACE +static void __save_stack_trace(struct ref_action *ra) +{ + ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2); +} + +static void __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + if (ra->trace_len == 0) { + btrfs_err(fs_info, " ref-verify: no stacktrace"); + return; + } + stack_trace_print(ra->trace, ra->trace_len, 2); +} +#else +static inline void __save_stack_trace(struct ref_action *ra) +{ +} + +static inline void __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, " ref-verify: no stacktrace support"); +} +#endif + +static void free_block_entry(struct block_entry *be) +{ + struct root_entry *re; + struct ref_entry *ref; + struct ref_action *ra; + struct rb_node *n; + + while ((n = rb_first(&be->roots))) { + re = rb_entry(n, struct root_entry, node); + rb_erase(&re->node, &be->roots); + kfree(re); + } + + while((n = rb_first(&be->refs))) { + ref = rb_entry(n, struct ref_entry, node); + rb_erase(&ref->node, &be->refs); + kfree(ref); + } + + while (!list_empty(&be->actions)) { + ra = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&ra->list); + kfree(ra); + } + kfree(be); +} + +static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 len, + u64 root_objectid) +{ + struct block_entry *be = NULL, *exist; + struct root_entry *re = NULL; + + re = kzalloc(sizeof(struct root_entry), GFP_NOFS); + be = kzalloc(sizeof(struct block_entry), GFP_NOFS); + if (!be || !re) { + kfree(re); + kfree(be); + return ERR_PTR(-ENOMEM); + } + be->bytenr = bytenr; + be->len = len; + + re->root_objectid = root_objectid; + re->num_refs = 0; + + spin_lock(&fs_info->ref_verify_lock); + exist = insert_block_entry(&fs_info->block_tree, be); + if (exist) { + if (root_objectid) { + struct root_entry *exist_re; + + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); + } else { + kfree(re); + } + kfree(be); + return exist; + } + + be->num_refs = 0; + be->metadata = 0; + be->from_disk = 0; + be->roots = RB_ROOT; + be->refs = RB_ROOT; + INIT_LIST_HEAD(&be->actions); + if (root_objectid) + insert_root_entry(&be->roots, re); + else + kfree(re); + return be; +} + +static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, + u64 parent, u64 bytenr, int level) +{ + struct block_entry *be; + struct root_entry *re; + struct ref_entry *ref = NULL, *exist; + + ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS); + if (!ref) + return -ENOMEM; + + if (parent) + ref->root_objectid = 0; + else + ref->root_objectid = ref_root; + ref->parent = parent; + ref->owner = level; + ref->offset = 0; + ref->num_refs = 1; + + be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs++; + be->from_disk = 1; + be->metadata = 1; + + if (!parent) { + ASSERT(ref_root); + re = lookup_root_entry(&be->roots, ref_root); + ASSERT(re); + 
re->num_refs++; + } + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + exist->num_refs++; + kfree(ref); + } + spin_unlock(&fs_info->ref_verify_lock); + + return 0; +} + +static int add_shared_data_ref(struct btrfs_fs_info *fs_info, + u64 parent, u32 num_refs, u64 bytenr, + u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, 0); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = parent; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing shared ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int add_extent_data_ref(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *dref, + u64 bytenr, u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + struct root_entry *re; + u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); + u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); + u64 offset = btrfs_extent_data_ref_offset(leaf, dref); + u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = 0; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "missing root in new block entry?"); + return -EINVAL; + } + re->num_refs += num_refs; + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *key, + int slot, int *tree_block_level) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; + u32 item_size = btrfs_item_size(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + if ((key->type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *tree_block_level = btrfs_tree_block_level(leaf, info); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + if (key->type == BTRFS_METADATA_ITEM_KEY) + *tree_block_level = key->offset; + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, offset, 0, key->objectid, + *tree_block_level); 
+ break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, offset, key->objectid, + *tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = add_extent_data_ref(fs_info, leaf, dref, + key->objectid, key->offset); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, offset, count, + key->objectid, key->offset); + break; + default: + btrfs_err(fs_info, "invalid key type in iref"); + ret = -EINVAL; + break; + } + if (ret) + break; + ptr += btrfs_extent_inline_ref_size(type); + } + return ret; +} + +static int process_leaf(struct btrfs_root *root, + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u32 count; + int i = 0, ret = 0; + struct btrfs_key key; + int nritems = btrfs_header_nritems(leaf); + + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + switch (key.type) { + case BTRFS_EXTENT_ITEM_KEY: + *num_bytes = key.offset; + fallthrough; + case BTRFS_METADATA_ITEM_KEY: + *bytenr = key.objectid; + ret = process_extent_item(fs_info, path, &key, i, + tree_block_level); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, key.offset, 0, + key.objectid, *tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, key.offset, + key.objectid, *tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(leaf, i, + struct btrfs_extent_data_ref); + ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, + *num_bytes); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(leaf, i, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, key.offset, count, + *bytenr, *num_bytes); + break; + default: + break; + } + if (ret) + break; + } + return ret; +} + +/* Walk down to the leaf from the given level */ +static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, + int level, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) +{ + struct extent_buffer *eb; + int ret = 0; + + while (level >= 0) { + if (level) { + eb = btrfs_read_node_slot(path->nodes[level], + path->slots[level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + btrfs_tree_read_lock(eb); + path->nodes[level-1] = eb; + path->slots[level-1] = 0; + path->locks[level-1] = BTRFS_READ_LOCK; + } else { + ret = process_leaf(root, path, bytenr, num_bytes, + tree_block_level); + if (ret) + break; + } + level--; + } + return ret; +} + +/* Walk up to the next node that needs to be processed */ +static int walk_up_tree(struct btrfs_path *path, int *level) +{ + int l; + + for (l = 0; l < BTRFS_MAX_LEVEL; l++) { + if (!path->nodes[l]) + continue; + if (l) { + path->slots[l]++; + if (path->slots[l] < + btrfs_header_nritems(path->nodes[l])) { + *level = l; + return 0; + } + } + btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); + free_extent_buffer(path->nodes[l]); + path->nodes[l] = NULL; + path->slots[l] = 0; + path->locks[l] = 0; + } + + return 1; +} + +static void dump_ref_action(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, +" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, 
offset %llu, num_refs %llu", + ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, + ra->ref.owner, ra->ref.offset, ra->ref.num_refs); + __print_stack_trace(fs_info, ra); +} + +/* + * Dumps all the information from the block entry to printk, it's going to be + * awesome. + */ +static void dump_block_entry(struct btrfs_fs_info *fs_info, + struct block_entry *be) +{ + struct ref_entry *ref; + struct root_entry *re; + struct ref_action *ra; + struct rb_node *n; + + btrfs_err(fs_info, +"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", + be->bytenr, be->len, be->num_refs, be->metadata, + be->from_disk); + + for (n = rb_first(&be->refs); n; n = rb_next(n)) { + ref = rb_entry(n, struct ref_entry, node); + btrfs_err(fs_info, +" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ref->root_objectid, ref->parent, ref->owner, + ref->offset, ref->num_refs); + } + + for (n = rb_first(&be->roots); n; n = rb_next(n)) { + re = rb_entry(n, struct root_entry, node); + btrfs_err(fs_info, " root entry %llu, num_refs %llu", + re->root_objectid, re->num_refs); + } + + list_for_each_entry(ra, &be->actions, list) + dump_ref_action(fs_info, ra); +} + +/* + * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * + * This will add an action item to the given bytenr and do sanity checks to make + * sure we haven't messed something up. If we are making a new allocation and + * this block entry has history we will delete all previous actions as long as + * our sanity checks pass as they are no longer needed. + */ +int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref) +{ + struct ref_entry *ref = NULL, *exist; + struct ref_action *ra = NULL; + struct block_entry *be = NULL; + struct root_entry *re = NULL; + int action = generic_ref->action; + int ret = 0; + bool metadata; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u64 ref_root = 0; + u64 owner = 0; + u64 offset = 0; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return 0; + + if (generic_ref->type == BTRFS_REF_METADATA) { + if (!parent) + ref_root = generic_ref->tree_ref.owning_root; + owner = generic_ref->tree_ref.level; + } else if (!parent) { + ref_root = generic_ref->data_ref.owning_root; + owner = generic_ref->data_ref.ino; + offset = generic_ref->data_ref.offset; + } + metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); + if (!ra || !ref) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + + ref->parent = parent; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; + ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; + + memcpy(&ra->ref, ref, sizeof(struct ref_entry)); + /* + * Save the extra info from the delayed ref in the ref action to make it + * easier to figure out what is happening. The real ref's we add to the + * ref tree need to reflect what we save on disk so it matches any + * on-disk refs we pre-loaded. + */ + ra->ref.owner = owner; + ra->ref.offset = offset; + ra->ref.root_objectid = ref_root; + __save_stack_trace(ra); + + INIT_LIST_HEAD(&ra->list); + ra->action = action; + ra->root = generic_ref->real_root; + + /* + * This is an allocation, preallocate the block_entry in case we haven't + * used it before. 
+ */ + ret = -EINVAL; + if (action == BTRFS_ADD_DELAYED_EXTENT) { + /* + * For subvol_create we'll just pass in whatever the parent root + * is and the new root objectid, so let's not treat the passed + * in root as if it really has a ref for this bytenr. + */ + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ref); + kfree(ra); + ret = PTR_ERR(be); + goto out; + } + be->num_refs++; + if (metadata) + be->metadata = 1; + + if (be->num_refs != 1) { + btrfs_err(fs_info, + "re-allocated a block that still has references to it!"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + + while (!list_empty(&be->actions)) { + struct ref_action *tmp; + + tmp = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&tmp->list); + kfree(tmp); + } + } else { + struct root_entry *tmp; + + if (!parent) { + re = kmalloc(sizeof(struct root_entry), GFP_NOFS); + if (!re) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + /* + * This is the root that is modifying us, so it's the + * one we want to look up below when we modify the + * re->num_refs. + */ + ref_root = generic_ref->real_root; + re->root_objectid = generic_ref->real_root; + re->num_refs = 0; + } + + spin_lock(&fs_info->ref_verify_lock); + be = lookup_block_entry(&fs_info->block_tree, bytenr); + if (!be) { + btrfs_err(fs_info, +"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", + action, bytenr, num_bytes); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + kfree(re); + goto out_unlock; + } else if (be->num_refs == 0) { + btrfs_err(fs_info, + "trying to do action %d for a bytenr that has 0 total references", + action); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + kfree(re); + goto out_unlock; + } + + if (!parent) { + tmp = insert_root_entry(&be->roots, re); + if (tmp) { + kfree(re); + re = tmp; + } + } + } + + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + if (action == BTRFS_DROP_DELAYED_REF) { + if (exist->num_refs == 0) { + btrfs_err(fs_info, +"dropping a ref for an existing root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + exist->num_refs--; + if (exist->num_refs == 0) { + rb_erase(&exist->node, &be->refs); + kfree(exist); + } + } else if (!be->metadata) { + exist->num_refs++; + } else { + btrfs_err(fs_info, +"attempting to add another ref for an existing ref on a tree block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + kfree(ref); + } else { + if (action == BTRFS_DROP_DELAYED_REF) { + btrfs_err(fs_info, +"dropping a ref for a root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + } + + if (!parent && !re) { + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + /* + * This shouldn't happen because we will add our re + * above when we look up the be with !parent, but just in + * case catch this case so we don't panic because I + * didn't think of some other corner case. 
+ */ + btrfs_err(fs_info, "failed to find root %llu for %llu", + generic_ref->real_root, be->bytenr); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + if (action == BTRFS_DROP_DELAYED_REF) { + if (re) + re->num_refs--; + be->num_refs--; + } else if (action == BTRFS_ADD_DELAYED_REF) { + be->num_refs++; + if (re) + re->num_refs++; + } + list_add_tail(&ra->list, &be->actions); + ret = 0; +out_unlock: + spin_unlock(&fs_info->ref_verify_lock); +out: + if (ret) { + btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } + return ret; +} + +/* Free up the ref cache */ +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ + struct block_entry *be; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + while ((n = rb_first(&fs_info->block_tree))) { + be = rb_entry(n, struct block_entry, node); + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + cond_resched_lock(&fs_info->ref_verify_lock); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len) +{ + struct block_entry *be = NULL, *entry; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + n = fs_info->block_tree.rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < start) { + n = n->rb_right; + } else if (entry->bytenr > start) { + n = n->rb_left; + } else { + be = entry; + break; + } + /* We want to get as close to start as possible */ + if (be == NULL || + (entry->bytenr < start && be->bytenr > start) || + (entry->bytenr < start && entry->bytenr > be->bytenr)) + be = entry; + } + + /* + * Could have an empty block group, maybe have something to check for + * this case to verify we were actually empty? 
+ */ + if (!be) { + spin_unlock(&fs_info->ref_verify_lock); + return; + } + + n = &be->node; + while (n) { + be = rb_entry(n, struct block_entry, node); + n = rb_next(n); + if (be->bytenr < start && be->bytenr + be->len > start) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + continue; + } + if (be->bytenr < start) + continue; + if (be->bytenr >= start + len) + break; + if (be->bytenr + be->len > start + len) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + } + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +/* Walk down all roots and build the ref tree, meant to be called at mount */ +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *extent_root; + struct btrfs_path *path; + struct extent_buffer *eb; + int tree_block_level = 0; + u64 bytenr = 0, num_bytes = 0; + int ret, level; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, 0); + eb = btrfs_read_lock_root_node(extent_root); + level = btrfs_header_level(eb); + path->nodes[level] = eb; + path->slots[level] = 0; + path->locks[level] = BTRFS_READ_LOCK; + + while (1) { + /* + * We have to keep track of the bytenr/num_bytes we last hit + * because we could have run out of space for an inline ref, and + * would have had to added a ref key item which may appear on a + * different leaf from the original extent item. + */ + ret = walk_down_tree(extent_root, path, level, + &bytenr, &num_bytes, &tree_block_level); + if (ret) + break; + ret = walk_up_tree(path, &level); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + if (ret) { + btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h new file mode 100644 index 0000000000..855de37719 --- /dev/null +++ b/fs/btrfs/ref-verify.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014 Facebook. All rights reserved. 
+ */ + +#ifndef BTRFS_REF_VERIFY_H +#define BTRFS_REF_VERIFY_H + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); +int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref); +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len); + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->ref_verify_lock); + fs_info->block_tree = RB_ROOT; +} +#else +static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + return 0; +} + +static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ +} + +static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref) +{ + return 0; +} + +static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, + u64 start, u64 len) +{ +} + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ +} + +#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ + +#endif diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c new file mode 100644 index 0000000000..65d2bd6910 --- /dev/null +++ b/fs/btrfs/reflink.c @@ -0,0 +1,935 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "compression.h" +#include "delalloc-space.h" +#include "disk-io.h" +#include "reflink.h" +#include "transaction.h" +#include "subpage.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" + +#define BTRFS_MAX_DEDUPE_LEN SZ_16M + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen, + int no_time_update) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + if (!no_time_update) { + inode->i_mtime = inode_set_ctime_current(inode); + } + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) { + i_size_write(inode, endoff); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + } + + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + ret = btrfs_end_transaction(trans); +out: + return ret; +} + +static int copy_inline_to_page(struct btrfs_inode *inode, + const u64 file_offset, + char *inline_data, + const u64 size, + const u64 datal, + const u8 comp_type) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 block_size = fs_info->sectorsize; + const u64 range_end = file_offset + block_size - 1; + const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); + char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); + struct extent_changeset *data_reserved = NULL; + struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; + int ret; + + ASSERT(IS_ALIGNED(file_offset, block_size)); + + /* + * We have flushed and locked the ranges of the source and destination + * inodes, we also have locked the inodes, so we are safe to do a + * reservation here. Also we must not do the reservation while holding + * a transaction open, otherwise we would deadlock. 
+ */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); + if (ret) + goto out; + + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + + clear_extent_bit(&inode->io_tree, file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); + if (ret) + goto out_unlock; + + /* + * After dirtying the page our caller will need to start a transaction, + * and if we are low on metadata free space, that can cause flushing of + * delalloc for all inodes in order to get metadata space released. + * However we are holding the range locked for the whole duration of + * the clone/dedupe operation, so we may deadlock if that happens and no + * other task releases enough space. So mark this inode as not being + * possible to flush to avoid such deadlock. We will clear that flag + * when we finish cloning all extents, since a transaction is started + * after finding each extent to clone. + */ + set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); + + if (comp_type == BTRFS_COMPRESS_NONE) { + memcpy_to_page(page, offset_in_page(file_offset), data_start, + datal); + } else { + ret = btrfs_decompress(comp_type, data_start, page, + offset_in_page(file_offset), + inline_size, datal); + if (ret) + goto out_unlock; + flush_dcache_page(page); + } + + /* + * If our inline data is smaller than the block/page size, then the + * remainder of the block/page is equivalent to zeroes. We had something + * like the following done: + * + * $ xfs_io -f -c "pwrite -S 0xab 0 500" file + * $ sync # (or fsync) + * $ xfs_io -c "falloc 0 4K" file + * $ xfs_io -c "pwrite -S 0xcd 4K 4K" + * + * So what's in the range [500, 4095] corresponds to zeroes. + */ + if (datal < block_size) + memzero_page(page, datal, block_size - datal); + + btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); + btrfs_page_clear_checked(fs_info, page, file_offset, block_size); + btrfs_page_set_dirty(fs_info, page, file_offset, block_size); +out_unlock: + if (page) { + unlock_page(page); + put_page(page); + } + if (ret) + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); +out: + extent_changeset_free(data_reserved); + + return ret; +} + +/* + * Deal with cloning of inline extents. We try to copy the inline extent from + * the source inode to the destination inode when possible. When not possible we + * copy the inline extent's data into the respective page of the inode. 
+ */ +static int clone_copy_inline_extent(struct inode *dst, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 size, + const u8 comp_type, + char *inline_data, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + fs_info->sectorsize); + struct btrfs_trans_handle *trans = NULL; + struct btrfs_drop_extents_args drop_args = { 0 }; + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) { + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); + goto out; + } + + key.objectid = btrfs_ino(BTRFS_I(dst)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + key.type == BTRFS_EXTENT_DATA_KEY) { + /* + * There's an implicit hole at file offset 0, copy the + * inline extent's data to the page. + */ + ASSERT(key.offset > 0); + goto copy_to_page; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent replace it with the source inline + * extent, otherwise copy the source inline extent data into + * the respective page at the destination inode. + */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + goto copy_to_page; + } + +copy_inline_extent: + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt with in the same way. + */ + if (i_size_read(dst) > datal) { + /* + * At the destination offset 0 we have either a hole, a regular + * extent or an inline extent larger than the one we want to + * clone. Deal with all these cases by copying the inline extent + * data into the respective page at the destination inode. + */ + goto copy_to_page; + } + + /* + * Release path before starting a new transaction so we don't hold locks + * that would confuse lockdep. + */ + btrfs_release_path(path); + /* + * If we end up here it means we're copying the inline extent into a leaf + * of the destination inode. We know we will drop or adjust at most one + * extent item in the destination root. 
+ * + * 1 unit - adjusting old extent (we may have to split it) + * 1 unit - add new extent + * 1 unit - inode update + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + drop_args.path = path; + drop_args.start = drop_start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); + if (ret) + goto out; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + goto out; + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(BTRFS_I(dst)); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); +out: + if (!ret && !trans) { + /* + * No transaction here means we copied the inline extent into a + * page of the destination inode. + * + * 1 unit to update inode item + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + } + } + if (ret && trans) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + if (!ret) + *trans_out = trans; + + return ret; + +copy_to_page: + /* + * Release our path because we don't need it anymore and also because + * copy_inline_to_page() needs to reserve data and metadata, which may + * need to flush delalloc when we are low on available space and + * therefore cause a deadlock if writeback of an inline extent needs to + * write to the same leaf or an ordered extent completion needs to write + * to the same leaf. + */ + btrfs_release_path(path); + + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); + goto out; +} + +/* + * Clone a range from inode file to another. 
+ * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff, int no_time_update) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + const u64 len = olen_aligned; + u64 last_dest_end = destoff; + u64 prev_extent_end = off; + + ret = -ENOMEM; + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + kvfree(buf); + return ret; + } + + path->reada = READA_FORWARD; + /* Clone data */ + key.objectid = btrfs_ino(BTRFS_I(src)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = off; + + while (1) { + struct btrfs_file_extent_item *extent; + u64 extent_gen; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; + + /* Note the key will change type as we walk through the tree */ + ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, + 0, 0); + if (ret < 0) + goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(BTRFS_I(src)->root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.type > BTRFS_EXTENT_DATA_KEY || + key.objectid != btrfs_ino(BTRFS_I(src))) + break; + + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + extent_gen = btrfs_file_extent_generation(leaf, extent); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, extent); + } + + /* + * The first search might have left us at an extent item that + * ends before our target range's start, can happen if we have + * holes and NO_HOLES feature enabled. 
+ * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. + */ + if (key.offset + datal <= prev_extent_end) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + + prev_extent_end = key.offset + datal; + size = btrfs_item_size(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item that + * represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning range or at + * the beginning (fully overlaps it or partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + struct btrfs_replace_extent_info clone_info; + + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* Subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* Subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + clone_info.disk_offset = disko; + clone_info.disk_len = diskl; + clone_info.data_offset = datao; + clone_info.data_len = datal; + clone_info.file_offset = new_key.offset; + clone_info.extent_buf = buf; + clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + drop_start, new_key.offset + datal - 1, + &clone_info, &trans); + if (ret) + goto out; + } else { + ASSERT(type == BTRFS_FILE_EXTENT_INLINE); + /* + * Inline extents always have to start at file offset 0 + * and can never be bigger then the sector size. We can + * never clone only parts of an inline extent, since all + * reflink operations must start at a sector size aligned + * offset, and the length must be aligned too or end at + * the i_size (which implies the whole inlined data). + */ + ASSERT(key.offset == 0); + ASSERT(datal <= fs_info->sectorsize); + if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || + WARN_ON(key.offset != 0) || + WARN_ON(datal > fs_info->sectorsize)) { + ret = -EUCLEAN; + goto out; + } + + ret = clone_copy_inline_extent(inode, path, &new_key, + drop_start, datal, size, + comp, buf, &trans); + if (ret) + goto out; + } + + btrfs_release_path(path); + + /* + * Whenever we share an extent we update the last_reflink_trans + * of each inode to the current transaction. This is needed to + * make sure fsync does not log multiple checksum items with + * overlapping ranges (because some extent items might refer + * only to sections of the original extent). For the destination + * inode we do this regardless of the generation of the extents + * or even if they are inline extents or explicit holes, to make + * sure a full fsync does not skip them. For the source inode, + * we only need to update last_reflink_trans in case it's a new + * extent that is not a hole or an inline extent, to deal with + * the checksums problem on fsync. 
+ */ + if (extent_gen == trans->transid && disko > 0) + BTRFS_I(src)->last_reflink_trans = trans->transid; + + BTRFS_I(inode)->last_reflink_trans = trans->transid; + + last_dest_end = ALIGN(new_key.offset + datal, + fs_info->sectorsize); + ret = clone_finish_inode_update(trans, inode, last_dest_end, + destoff, olen, no_time_update); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; + + btrfs_release_path(path); + key.offset = prev_extent_end; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); + } + ret = 0; + + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. + */ + btrfs_release_path(path); + + /* + * When using NO_HOLES and we are cloning a range that covers + * only a hole (no extents) into a range beyond the current + * i_size, punching a hole in the target range will not create + * an extent map defining a hole, because the range starts at or + * beyond current i_size. If the file previously had an i_size + * greater than the new i_size set by this clone operation, we + * need to make sure the next fsync is a full fsync, so that it + * detects and logs a hole covering a range from the current + * i_size to the new i_size. If the clone range covers extents, + * besides a hole, then we know the full sync flag was already + * set by previous calls to btrfs_replace_file_extents() that + * replaced file extent items. + */ + if (last_dest_end >= i_size_read(inode)) + btrfs_set_inode_full_sync(BTRFS_I(inode)); + + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + last_dest_end, destoff + len - 1, NULL, &trans); + if (ret) + goto out; + + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen, no_time_update); + } + +out: + btrfs_free_path(path); + kvfree(buf); + clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); + + return ret; +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + u64 range1_end = loff1 + len - 1; + u64 range2_end = loff2 + len - 1; + + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + swap(range1_end, range2_end); + } else if (inode1 == inode2 && loff2 < loff1) { + swap(loff1, loff2); + swap(range1_end, range2_end); + } + + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); + + btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); + btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); +} + +static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + down_write(&BTRFS_I(inode1)->i_mmap_lock); + down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); +} + +static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +{ + up_write(&BTRFS_I(inode1)->i_mmap_lock); + up_write(&BTRFS_I(inode2)->i_mmap_lock); +} + +static int 
btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + const u64 bs = fs_info->sb->s_blocksize; + int ret; + + /* + * Lock destination range to serialize with concurrent readahead() and + * source range to serialize with relocation. + */ + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + + btrfs_btree_balance_dirty(fs_info); + + return ret; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff) +{ + int ret = 0; + u64 i, tail_len, chunk_count; + struct btrfs_root *root_dst = BTRFS_I(dst)->root; + + spin_lock(&root_dst->root_item_lock); + if (root_dst->send_in_progress) { + btrfs_warn_rl(root_dst->fs_info, +"cannot deduplicate to root %llu while send operations are using it (%d in progress)", + root_dst->root_key.objectid, + root_dst->send_in_progress); + spin_unlock(&root_dst->root_item_lock); + return -EAGAIN; + } + root_dst->dedupe_in_progress++; + spin_unlock(&root_dst->root_item_lock); + + tail_len = olen % BTRFS_MAX_DEDUPE_LEN; + chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); + + for (i = 0; i < chunk_count; i++) { + ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, + dst, dst_loff); + if (ret) + goto out; + + loff += BTRFS_MAX_DEDUPE_LEN; + dst_loff += BTRFS_MAX_DEDUPE_LEN; + } + + if (tail_len > 0) + ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); +out: + spin_lock(&root_dst->root_item_lock); + root_dst->dedupe_in_progress--; + spin_unlock(&root_dst->root_item_lock); + + return ret; +} + +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = file_inode(file); + struct inode *src = file_inode(file_src); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + int wb_ret; + u64 len = olen; + u64 bs = fs_info->sb->s_blocksize; + + /* + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. + */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + if (destoff > inode->i_size) { + const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + + ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); + if (ret) + return ret; + /* + * We may have truncated the last block if the inode's size is + * not sector size aligned, so we need to wait for writeback to + * complete before proceeding further, otherwise we can race + * with cloning and attempt to increment a reference to an + * extent that no longer exists (writeback completed right after + * we found the previous extent covering eof and before we + * attempted to increment its reference count). + */ + ret = btrfs_wait_ordered_range(inode, wb_start, + destoff - wb_start); + if (ret) + return ret; + } + + /* + * Lock destination range to serialize with concurrent readahead() and + * source range to serialize with relocation. 
+ */ + btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); + btrfs_double_extent_unlock(src, off, inode, destoff, len); + + /* + * We may have copied an inline extent into a page of the destination + * range, so wait for writeback to complete before truncating pages + * from the page cache. This is a rare case. + */ + wb_ret = btrfs_wait_ordered_range(inode, destoff, len); + ret = ret ? ret : wb_ret; + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, + round_down(destoff, PAGE_SIZE), + round_up(destoff + len, PAGE_SIZE) - 1); + + btrfs_btree_balance_dirty(fs_info); + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + ASSERT(inode_in->i_sb == inode_out->i_sb); + } + + /* Don't make the dst file partly checksummed */ + if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + return -EINVAL; + } + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. + * + * Btrfs' back references do not have a block level granularity, they + * work at the whole extent level. + * NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * nocow write should reach disk as nocow before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extent will be + * CoWed anyway, not affecting nocow part. 
+ */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + return ret; + + return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); +} + +static bool file_sync_write(const struct file *file) +{ + if (file->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(file))) + return true; + + return false; +} + +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (same_inode) { + btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + } else { + lock_two_nondirectories(src_inode, dst_inode); + btrfs_double_mmap_lock(src_inode, dst_inode); + } + + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + +out_unlock: + if (same_inode) { + btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + } else { + btrfs_double_mmap_unlock(src_inode, dst_inode); + unlock_two_nondirectories(src_inode, dst_inode); + } + + /* + * If either the source or the destination file was opened with O_SYNC, + * O_DSYNC or has the S_SYNC attribute, fsync both the destination and + * source files/ranges, so that after a successful return (0) followed + * by a power failure results in the reflinked data to be readable from + * both files/ranges. + */ + if (ret == 0 && len > 0 && + (file_sync_write(src_file) || file_sync_write(dst_file))) { + ret = btrfs_sync_file(src_file, off, off + len - 1, 0); + if (ret == 0) + ret = btrfs_sync_file(dst_file, destoff, + destoff + len - 1, 0); + } + + return ret < 0 ? ret : len; +} diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h new file mode 100644 index 0000000000..ecb309b4da --- /dev/null +++ b/fs/btrfs/reflink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_REFLINK_H +#define BTRFS_REFLINK_H + +#include + +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); + +#endif /* BTRFS_REFLINK_H */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 0000000000..4eaac3ae5c --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,4573 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2009 Oracle. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" +#include "free-space-cache.h" +#include "qgroup.h" +#include "print-tree.h" +#include "delalloc-space.h" +#include "block-group.h" +#include "backref.h" +#include "misc.h" +#include "subpage.h" +#include "zoned.h" +#include "inode-item.h" +#include "space-info.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "file-item.h" +#include "relocation.h" +#include "super.h" +#include "tree-checker.h" + +/* + * Relocation overview + * + * [What does relocation do] + * + * The objective of relocation is to relocate all extents of the target block + * group to other block groups. + * This is utilized by resize (shrink only), profile converting, compacting + * space, or balance routine to spread chunks over devices. + * + * Before | After + * ------------------------------------------------------------------ + * BG A: 10 data extents | BG A: deleted + * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) + * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) + * + * [How does relocation work] + * + * 1. Mark the target block group read-only + * New extents won't be allocated from the target block group. + * + * 2.1 Record each extent in the target block group + * To build a proper map of extents to be relocated. + * + * 2.2 Build data reloc tree and reloc trees + * Data reloc tree will contain an inode, recording all newly relocated + * data extents. + * There will be only one data reloc tree for one data block group. + * + * Reloc tree will be a special snapshot of its source tree, containing + * relocated tree blocks. + * Each tree referring to a tree block in target block group will get its + * reloc tree built. + * + * 2.3 Swap source tree with its corresponding reloc tree + * Each involved tree only refers to new extents after swap. + * + * 3. Cleanup reloc trees and data reloc tree. + * As old extents in the target block group are still referenced by reloc + * trees, we need to clean them up before really freeing the target block + * group. + * + * The main complexity is in steps 2.2 and 2.3. + * + * The entry point of relocation is relocate_block_group() function. 
+ */ + +#define RELOCATION_RESERVED_NODES 256 +/* + * map address of tree root to tree + */ +struct mapping_node { + struct { + struct rb_node rb_node; + u64 bytenr; + }; /* Use rb_simle_node for search/insert */ + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct { + struct rb_node rb_node; + u64 bytenr; + }; /* Use rb_simple_node for search/insert */ + u64 owner; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + + struct btrfs_block_rsv *block_rsv; + + struct btrfs_backref_cache backref_cache; + + struct file_extent_cluster cluster; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + /* list of subvolume trees that get relocated */ + struct list_head dirty_subvol_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; + + u64 search_start; + u64 extents_found; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; + unsigned int found_file_extent:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +static void mark_block_processed(struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u32 blocksize; + + if (node->level == 0 || + in_range(node->bytenr, rc->block_group->start, + rc->block_group->length)) { + blocksize = rc->extent_root->fs_info->nodesize; + set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); + } + node->processed = 1; +} + + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root = RB_ROOT; + spin_lock_init(&tree->lock); +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct btrfs_backref_node *walk_up_backref( + struct btrfs_backref_node *node, + struct btrfs_backref_edge *edges[], int *index) +{ + struct btrfs_backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct btrfs_backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + BUG_ON(node->detached); + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct btrfs_backref_node *walk_down_backref( + struct btrfs_backref_edge *edges[], int *index) +{ + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct btrfs_backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void 
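/*
 * Toy analogue, not part of this patch: walk_up_backref()/walk_down_backref()
 * above enumerate every chain of "upper" references from a block up to a
 * tree root.  This standalone sketch does the same over a tiny invented
 * graph (leaf -> mid -> root1, leaf -> mid -> root2, leaf -> root2).
 */
#include <stdio.h>

#define MAX_UP		2
#define MAX_DEPTH	8

struct toy_node {
	const char *name;
	struct toy_node *upper[MAX_UP];	/* parents referencing this node */
	int nr_upper;
};

int main(void)
{
	struct toy_node root1 = { "root1", { NULL, NULL }, 0 };
	struct toy_node root2 = { "root2", { NULL, NULL }, 0 };
	struct toy_node mid   = { "mid",   { &root1, &root2 }, 2 };
	struct toy_node leaf  = { "leaf",  { &mid, &root2 }, 2 };
	struct toy_node *path[MAX_DEPTH];
	int choice[MAX_DEPTH];
	int depth = 0;
	int i, advanced;

	path[0] = &leaf;
	choice[0] = 0;
	for (;;) {
		/* "walk up": follow the chosen upper edge until a root */
		while (path[depth]->nr_upper > 0) {
			path[depth + 1] = path[depth]->upper[choice[depth]];
			depth++;
			choice[depth] = 0;
		}
		/* one complete reference path, lowest block first */
		for (i = 0; i <= depth; i++)
			printf("%s%s", path[i]->name, i == depth ? "\n" : " -> ");
		/* "walk down": backtrack to the deepest untried upper edge */
		advanced = 0;
		while (depth > 0) {
			depth--;
			if (choice[depth] + 1 < path[depth]->nr_upper) {
				choice[depth]++;
				advanced = 1;
				break;
			}
		}
		if (!advanced)
			break;
	}
	return 0;
}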
update_backref_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct btrfs_backref_node, list); + btrfs_backref_cleanup_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct btrfs_backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + +static bool reloc_root_is_dead(struct btrfs_root *root) +{ + /* + * Pair with set_bit/clear_bit in clean_dirty_subvols and + * btrfs_update_reloc_root. We need to see the updated bit before + * trying to access reloc_root + */ + smp_rmb(); + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return true; + return false; +} + +/* + * Check if this subvolume tree has valid reloc tree. + * + * Reloc tree after swap is considered dead, thus not considered as valid. + * This is enough for most callers, as they don't distinguish dead reloc root + * from no reloc root. But btrfs_should_ignore_reloc_root() below is a + * special case. + */ +static bool have_reloc_root(struct btrfs_root *root) +{ + if (reloc_root_is_dead(root)) + return false; + if (!root->reloc_root) + return false; + return true; +} + +int btrfs_should_ignore_reloc_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return 0; + + /* This root has been merged with its reloc tree, we can ignore it */ + if (reloc_root_is_dead(root)) + return 1; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_header_generation(reloc_root->commit_root) == + root->fs_info->running_transaction->transid) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. 
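/*
 * Toy sketch, not part of this patch: what update_backref_node() above does
 * after a transaction commit - a cached node is re-keyed from its old bytenr
 * to new_bytenr so later lookups still find it.  A sorted array stands in
 * for the rbtree; all values are invented.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_entry {
	unsigned long long bytenr;	/* search key */
	const char *data;
};

static int cmp_bytenr(const void *a, const void *b)
{
	const struct toy_entry *x = a, *y = b;

	return (x->bytenr > y->bytenr) - (x->bytenr < y->bytenr);
}

int main(void)
{
	struct toy_entry cache[] = {
		{ 1048576, "node A" },
		{ 2097152, "node B (relocated this transaction)" },
		{ 4194304, "node C" },
	};
	size_t i, n = sizeof(cache) / sizeof(cache[0]);

	/* after the commit the block behind node B lives at a new address:
	 * drop the old key and re-insert under new_bytenr */
	cache[1].bytenr = 8388608;
	qsort(cache, n, sizeof(cache[0]), cmp_bytenr);

	for (i = 0; i < n; i++)
		printf("%llu -> %s\n", cache[i].bytenr, cache[i].data);
	return 0;
}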
+ */ + return 1; +} + +/* + * find reloc tree by address of tree root + */ +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct reloc_control *rc = fs_info->reloc_ctl; + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + ASSERT(rc); + spin_lock(&rc->reloc_root_tree.lock); + rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return btrfs_grab_root(root); +} + +/* + * For useless nodes, do two major clean ups: + * + * - Cleanup the children edges and nodes + * If child node is also orphan (no parent) during cleanup, then the child + * node will also be cleaned up. + * + * - Freeing up leaves (level 0), keeps nodes detached + * For nodes, the node is still cached as "detached" + * + * Return false if @node is not in the @useless_nodes list. + * Return true if @node is in the @useless_nodes list. + */ +static bool handle_useless_nodes(struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_cache *cache = &rc->backref_cache; + struct list_head *useless_node = &cache->useless_node; + bool ret = false; + + while (!list_empty(useless_node)) { + struct btrfs_backref_node *cur; + + cur = list_first_entry(useless_node, struct btrfs_backref_node, + list); + list_del_init(&cur->list); + + /* Only tree root nodes can be added to @useless_nodes */ + ASSERT(list_empty(&cur->upper)); + + if (cur == node) + ret = true; + + /* The node is the lowest node */ + if (cur->lowest) { + list_del_init(&cur->lower); + cur->lowest = 0; + } + + /* Cleanup the lower edges */ + while (!list_empty(&cur->lower)) { + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *lower; + + edge = list_entry(cur->lower.next, + struct btrfs_backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + btrfs_backref_free_edge(cache, edge); + + /* Child node is also orphan, queue for cleanup */ + if (list_empty(&lower->upper)) + list_add(&lower->list, useless_node); + } + /* Mark this block processed for relocation */ + mark_block_processed(rc, cur); + + /* + * Backref nodes for tree leaves are deleted from the cache. + * Backref nodes for upper level tree blocks are left in the + * cache to avoid unnecessary backref lookup. + */ + if (cur->level > 0) { + list_add(&cur->list, &cache->detached); + cur->detached = 1; + } else { + rb_erase(&cur->rb_node, &cache->rb_root); + btrfs_backref_free_node(cache, cur); + } + } + return ret; +} + +/* + * Build backref tree for a given tree block. Root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond roots of + * b-trees that reference the tree block. + * + * The basic idea of this function is check backrefs of a given block to find + * upper level blocks that reference the block, and then check backrefs of + * these upper level blocks recursively. The recursion stops when tree root is + * reached or backrefs for the block is cached. + * + * NOTE: if we find that backrefs for a block are cached, we know backrefs for + * all upper level blocks that directly/indirectly reference the block are also + * cached. 
+ */ +static noinline_for_stack struct btrfs_backref_node *build_backref_tree( + struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct btrfs_backref_iter *iter; + struct btrfs_backref_cache *cache = &rc->backref_cache; + /* For searching parent of TREE_BLOCK_REF */ + struct btrfs_path *path; + struct btrfs_backref_node *cur; + struct btrfs_backref_node *node = NULL; + struct btrfs_backref_edge *edge; + int ret; + int err = 0; + + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); + if (!iter) + return ERR_PTR(-ENOMEM); + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + node = btrfs_backref_alloc_node(cache, bytenr, level); + if (!node) { + err = -ENOMEM; + goto out; + } + + node->lowest = 1; + cur = node; + + /* Breadth-first search to build backref cache */ + do { + ret = btrfs_backref_add_tree_node(trans, cache, path, iter, + node_key, cur); + if (ret < 0) { + err = ret; + goto out; + } + edge = list_first_entry_or_null(&cache->pending_edge, + struct btrfs_backref_edge, list[UPPER]); + /* + * The pending list isn't empty, take the first block to + * process + */ + if (edge) { + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + } + } while (edge); + + /* Finish the upper linkage of newly added edges/nodes */ + ret = btrfs_backref_finish_upper_links(cache, node); + if (ret < 0) { + err = ret; + goto out; + } + + if (handle_useless_nodes(rc, node)) + node = NULL; +out: + btrfs_backref_iter_free(iter); + btrfs_free_path(path); + if (err) { + btrfs_backref_error_cleanup(cache, node); + return ERR_PTR(err); + } + ASSERT(!node || !node->detached); + ASSERT(list_empty(&cache->useless_node) && + list_empty(&cache->pending_edge)); + return node; +} + +/* + * helper to add backref node for the newly created snapshot. 
+ * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct btrfs_backref_cache *cache = &rc->backref_cache; + struct btrfs_backref_node *node = NULL; + struct btrfs_backref_node *new_node; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = rb_simple_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct btrfs_backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = btrfs_backref_alloc_node(cache, dest->node->start, + node->level); + if (!new_node) + return -ENOMEM; + + new_node->lowest = node->lowest; + new_node->checked = 1; + new_node->root = btrfs_grab_root(dest); + ASSERT(new_node->root); + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = btrfs_backref_alloc_edge(cache); + if (!new_edge) + goto fail; + + btrfs_backref_link_edge(new_edge, edge->node[LOWER], + new_node, LINK_UPPER); + } + } else { + list_add_tail(&new_node->lower, &cache->leaves); + } + + rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + if (rb_node) + btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = list_entry(new_node->lower.next, + struct btrfs_backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + btrfs_backref_free_edge(cache, new_edge); + } + btrfs_backref_free_node(cache, new_node); + return -ENOMEM; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __must_check __add_reloc_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) + return -ENOMEM; + + node->bytenr = root->commit_root->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) { + btrfs_err(fs_info, + "Duplicate root found for start=%llu while inserting into relocation tree", + node->bytenr); + return -EEXIST; + } + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to delete the 'address of tree root -> reloc tree' + * mapping + */ +static void __del_reloc_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = fs_info->reloc_ctl; + bool put_ref = false; + + if (rc && root->node) { + spin_lock(&rc->reloc_root_tree.lock); + 
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + RB_CLEAR_NODE(&node->rb_node); + } + spin_unlock(&rc->reloc_root_tree.lock); + ASSERT(!node || (struct btrfs_root *)node->data == root); + } + + /* + * We only put the reloc root here if it's on the list. There's a lot + * of places where the pattern is to splice the rc->reloc_roots, process + * the reloc roots, and then add the reloc root back onto + * rc->reloc_roots. If we call __del_reloc_root while it's off of the + * list we don't want the reference being dropped, because the guy + * messing with the list is in charge of the reference. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&root->root_list)) { + put_ref = true; + list_del_init(&root->root_list); + } + spin_unlock(&fs_info->trans_lock); + if (put_ref) + btrfs_put_root(root); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); + return 0; +} + +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret = 0; + bool must_abort = false; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + if (!root_item) + return ERR_PTR(-ENOMEM); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = objectid; + + if (root->root_key.objectid == objectid) { + u64 commit_root_gen; + + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + if (ret) + goto fail; + + /* + * Set the last_snapshot field to the generation of the commit + * root - like this ctree.c:btrfs_block_can_be_shared() behaves + * correctly (returns true) when the relocation root is created + * either inside the critical section of a transaction commit + * (through transaction.c:qgroup_account_snapshot()) and when + * it's created before the transaction commit is started. + */ + commit_root_gen = btrfs_header_generation(root->commit_root); + btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. 
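/*
 * Illustrative sketch, not part of this patch: the key create_reloc_root()
 * above builds for every reloc tree root item - the objectid is always
 * BTRFS_TREE_RELOC_OBJECTID and the offset is the id of the tree being
 * relocated (the top-level FS tree is used as an example value).  Assumes
 * the uapi header linux/btrfs_tree.h is available for the constants.
 */
#include <stdio.h>
#include <linux/btrfs_tree.h>	/* BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY */

int main(void)
{
	unsigned long long objectid = BTRFS_TREE_RELOC_OBJECTID;
	unsigned int type = BTRFS_ROOT_ITEM_KEY;
	unsigned long long offset = BTRFS_FS_TREE_OBJECTID;	/* source tree id */

	printf("reloc root key: (%llu %u %llu)\n", objectid, type, offset);
	return 0;
}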
+ */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + if (ret) + goto fail; + } + + /* + * We have changed references at this point, we must abort the + * transaction if anything fails. + */ + must_abort = true; + + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + btrfs_set_root_drop_level(root_item, 0); + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, fs_info->tree_root, + &root_key, root_item); + if (ret) + goto fail; + + kfree(root_item); + + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); + if (IS_ERR(reloc_root)) { + ret = PTR_ERR(reloc_root); + goto abort; + } + set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); + reloc_root->last_trans = trans->transid; + return reloc_root; +fail: + kfree(root_item); +abort: + if (must_abort) + btrfs_abort_transaction(trans, ret); + return ERR_PTR(ret); +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + * + * The reloc_root comes out of here with two references, one for + * root->reloc_root, and another for being on the rc->reloc_roots list. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *reloc_root; + struct reloc_control *rc = fs_info->reloc_ctl; + struct btrfs_block_rsv *rsv; + int clear_rsv = 0; + int ret; + + if (!rc) + return 0; + + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (reloc_root_is_dead(root)) + return 0; + + /* + * This is subtle but important. We do not do + * record_root_in_transaction for reloc roots, instead we record their + * corresponding fs root, and then here we update the last trans for the + * reloc root. This means that we have to do this for the entire life + * of the reloc root, regardless of which stage of the relocation we are + * in. + */ + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + /* + * We are merging reloc roots, we do not need new reloc trees. Also + * reloc trees never need their own reloc tree. 
+ */ + if (!rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->reloc_reserved) { + rsv = trans->block_rsv; + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = rsv; + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); + + ret = __add_reloc_root(reloc_root); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } + root->reloc_root = btrfs_grab_root(reloc_root); + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int ret; + + if (!have_reloc_root(root)) + return 0; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + /* + * We are probably ok here, but __del_reloc_root() will drop its ref of + * the root. We have the ref for root->reloc_root, but just in case + * hold it while we update the reloc root. + */ + btrfs_grab_root(reloc_root); + + /* root->reloc_root will stay until current relocation finished */ + if (fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { + set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + /* + * Mark the tree as dead before we change reloc_root so + * have_reloc_root will not touch it from now on. + */ + smp_wmb(); + __del_reloc_root(reloc_root); + } + + if (reloc_root->commit_root != reloc_root->node) { + __update_reloc_root(reloc_root); + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, fs_info->tree_root, + &reloc_root->root_key, root_item); + btrfs_put_root(reloc_root); + return ret; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(entry)) + node = node->rb_left; + else if (objectid > btrfs_ino(entry)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(entry)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + return inode; + } + + objectid = btrfs_ino(entry) + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= 
BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, + btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = -EINVAL; + goto out; + } + + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr = 0; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret = 0; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { 0 }; + + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_range(bytenr, rc->block_group->start, + rc->block_group->length)) + continue; + + /* + * if we are modifying block in fs tree, wait for read_folio + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (first) { + inode = find_next_inode(root, key.objectid); + first = 0; + } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { + btrfs_add_delayed_iput(BTRFS_I(inode)); + inode = find_next_inode(root, key.objectid); + } + if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + struct extent_state *cached_state = NULL; + + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + fs_info->sectorsize)); + WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + &cached_state); + if (!ret) + continue; + + btrfs_drop_extent_map_range(BTRFS_I(inode), + key.offset, end, true); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, &cached_state); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret) { + /* + * Don't have to abort since we've not changed anything + * in the file extent yet. 
+ */ + break; + } + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, + num_bytes, parent); + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + key.objectid, key.offset, + root->root_key.objectid, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + num_bytes, parent); + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + key.objectid, key.offset, + root->root_key.objectid, false); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + if (dirty) + btrfs_mark_buffer_dirty(trans, leaf); + if (inode) + btrfs_add_delayed_iput(BTRFS_I(inode)); + return ret; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) +{ + struct btrfs_fs_info *fs_info = dest->fs_info; + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_ref ref = { 0 }; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int cow = 0; + int level; + int ret; + int slot; + + ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); +again: + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return ret; + } + } + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + ASSERT(level >= lowest_level); + + ret = btrfs_bin_search(parent, 0, &key, &slot); + if (ret < 0) + break; + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = fs_info->nodesize; + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = 
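/*
 * Toy sketch, not part of this patch (values invented): the core of
 * replace_file_extents() above - every file extent that points into the
 * block group being relocated gets its disk_bytenr rewritten to the new
 * location, everything else is left alone.  A plain array stands in for the
 * leaf, and a fixed base offset stands in for the lookup that
 * get_new_location() performs through the data reloc inode.
 */
#include <stdio.h>

struct toy_file_extent {
	unsigned long long disk_bytenr;
	unsigned long long disk_num_bytes;
};

int main(void)
{
	const unsigned long long bg_start = 1ULL << 30;	/* block group start */
	const unsigned long long bg_len   = 1ULL << 30;	/* block group length */
	const unsigned long long new_base = 8ULL << 30;	/* relocated copy */
	struct toy_file_extent leaf[] = {
		{ (1ULL << 30) + (4ULL << 20),  4ULL << 20 },	/* inside */
		{ 3ULL << 30,                   8ULL << 20 },	/* outside, untouched */
		{ (1ULL << 30) + (64ULL << 20), 4ULL << 20 },	/* inside */
	};
	size_t i;

	for (i = 0; i < sizeof(leaf) / sizeof(leaf[0]); i++) {
		unsigned long long old = leaf[i].disk_bytenr;

		if (old < bg_start || old >= bg_start + bg_len)
			continue;	/* not in the block group being relocated */
		/* stand-in for get_new_location(): old bytenr -> new bytenr */
		leaf[i].disk_bytenr = new_base + (old - bg_start);
		printf("extent %zu: %llu -> %llu\n", i, old, leaf[i].disk_bytenr);
	}
	return 0;
}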
btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level) { + ret = 0; + break; + } + + eb = btrfs_read_node_slot(parent, slot); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + break; + } + btrfs_tree_lock(eb); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb, + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + break; + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(path); + + path->lowest_level = level; + set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); + path->lowest_level = 0; + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + + /* + * Info qgroup to trace both subtrees. + * + * We must trace both trees. + * 1) Tree reloc subtree + * If not traced, we will leak data numbers + * 2) Fs subtree + * If not traced, we will double count old data + * + * We don't scan the subtree right now, but only record + * the swapped tree blocks. + * The real subtree rescan is delayed until we have new + * CoW on the subtree root node before transaction commit. + */ + ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + rc->block_group, parent, slot, + path->nodes[level], path->slots[level], + last_snapshot); + if (ret < 0) + break; + /* + * swap blocks in fs tree and reloc tree. 
+ */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(trans, parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(trans, path->nodes[level]); + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, + blocksize, path->nodes[level]->start); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, + blocksize, 0); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, + true); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, + blocksize, path->nodes[level]->start); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, + blocksize, 0); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, + 0, true); + ret = btrfs_free_extent(trans, &ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + eb = btrfs_read_node_slot(eb, path->slots[i]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int 
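/*
 * Toy sketch, not part of this patch (values invented): the pruning rule used
 * by walk_down_reloc_tree()/walk_up_reloc_tree() above.  A child pointer
 * whose generation is <= last_snapshot has not changed since the reloc tree
 * was created, so the subtree below it can be skipped; only newer pointers
 * lead to relocated blocks that still need merging.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long last_snapshot = 100;
	unsigned long long ptr_gen[] = { 97, 100, 101, 103, 99 };
	int i;

	for (i = 0; i < 5; i++) {
		if (ptr_gen[i] <= last_snapshot)
			printf("slot %d (gen %llu): skip, unchanged since snapshot\n",
			       i, ptr_gen[i]);
		else
			printf("slot %d (gen %llu): descend, relocated block below\n",
			       i, ptr_gen[i]);
	}
	return 0;
}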
invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + u64 ino; + + objectid = min_key->objectid; + while (1) { + struct extent_state *cached_state = NULL; + + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + ino = btrfs_ino(BTRFS_I(inode)); + + if (ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for read_folio to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * Insert current subvolume into reloc_control::dirty_subvol_roots + */ +static int insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root = root->reloc_root; + struct btrfs_root_item *reloc_root_item; + int ret; + + /* @root must be a subvolume tree root with a valid reloc tree */ + ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(reloc_root); + + reloc_root_item = &reloc_root->root_item; + memset(&reloc_root_item->drop_progress, 0, + sizeof(reloc_root_item->drop_progress)); + btrfs_set_root_drop_level(reloc_root_item, 0); + btrfs_set_root_refs(reloc_root_item, 0); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; + + if (list_empty(&root->reloc_dirty_list)) { + btrfs_grab_root(root); + list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); + } + + return 0; +} + +static int clean_dirty_subvols(struct reloc_control *rc) +{ + struct btrfs_root *root; + struct btrfs_root *next; + int ret = 0; + int ret2; + + list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, + reloc_dirty_list) { + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + /* Merged subvolume, cleanup its reloc root */ + struct btrfs_root *reloc_root = root->reloc_root; + + list_del_init(&root->reloc_dirty_list); + root->reloc_root = NULL; + /* + * Need barrier to ensure clear_bit() only happens after + * root->reloc_root = NULL. Pairs with have_reloc_root. 
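/*
 * Toy sketch, not part of this patch (values invented): how
 * invalidate_extent_cache() above clamps the per-inode range it drops.  Only
 * the first and the last inode of the [min_key, max_key) span get partial
 * ranges; every inode in between is invalidated completely.  The key-type
 * corner cases handled by the real function are omitted here.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long min_ino = 260, min_off = 1ULL << 20;	/* min_key */
	unsigned long long max_ino = 263, max_off = 4ULL << 20;	/* max_key */
	unsigned long long ino;

	for (ino = min_ino; ino <= max_ino; ino++) {
		unsigned long long start = 0, end = ~0ULL;

		if (ino == min_ino)
			start = min_off;
		if (ino == max_ino)
			end = max_off - 1;	/* max_key is exclusive */
		printf("inode %llu: drop extent maps in [%llu, %llu]\n",
		       ino, start, end);
	}
	return 0;
}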
+ */ + smp_wmb(); + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + if (reloc_root) { + /* + * btrfs_drop_snapshot drops our ref we hold for + * ->reloc_root. If it fails however we must + * drop the ref ourselves. + */ + ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + if (ret2 < 0) { + btrfs_put_root(reloc_root); + if (!ret) + ret = ret2; + } + } + btrfs_put_root(root); + } else { + /* Orphan reloc tree, just clean it up */ + ret2 = btrfs_drop_snapshot(root, 0, 1); + if (ret2 < 0) { + btrfs_put_root(root); + if (!ret) + ret = ret2; + } + } + } + return ret; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. + */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int reserve_level; + int level; + int max_level; + int replaced = 0; + int ret = 0; + u32 min_reserved; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + atomic_inc(&reloc_root->node->refs); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = btrfs_root_drop_level(root_item); + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + /* + * In merge_reloc_root(), we modify the upper level pointer to swap the + * tree blocks between reloc tree and subvolume tree. Thus for tree + * block COW, we COW at most from level 1 to root level for each tree. + * + * Thus the needed metadata size is at most root_level * nodesize, + * and * 2 since we have two trees to COW. + */ + reserve_level = max_t(int, 1, btrfs_root_level(root_item)); + min_reserved = fs_info->nodesize * reserve_level * 2; + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + min_reserved, + BTRFS_RESERVE_FLUSH_LIMIT); + if (ret) + goto out; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + /* + * At this point we no longer have a reloc_control, so we can't + * depend on btrfs_init_reloc_root to update our last_trans. + * + * But that's ok, we started the trans handle on our + * corresponding fs_root, which means it's been added to the + * dirty list. At commit time we'll still call + * btrfs_update_reloc_root() and update our root item + * appropriately. 
+ */ + reloc_root->last_trans = trans->transid; + trans->block_rsv = rc->block_rsv; + + replaced = 0; + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) + goto out; + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else { + ret = replace_path(trans, rc, root, reloc_root, path, + &next_key, level, max_level); + } + if (ret < 0) + goto out; + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + btrfs_set_root_drop_level(root_item, level); + + btrfs_end_transaction_throttle(trans); + trans = NULL; + + btrfs_btree_balance_dirty(fs_info); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. + */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); +out: + btrfs_free_path(path); + + if (ret == 0) { + ret = insert_dirty_subvol(trans, rc, root); + if (ret) + btrfs_abort_transaction(trans, ret); + } + + if (trans) + btrfs_end_transaction_throttle(trans); + + btrfs_btree_balance_dirty(fs_info); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return ret; +} + +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) +{ + struct btrfs_root *root = rc->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + + mutex_lock(&fs_info->reloc_mutex); + rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&fs_info->reloc_mutex); + +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(fs_info, rc->block_rsv, + num_bytes, NULL); + return PTR_ERR(trans); + } + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans); + btrfs_block_rsv_release(fs_info, rc->block_rsv, + num_bytes, NULL); + goto again; + } + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); + + root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, + false); + if (IS_ERR(root)) { + /* + * Even if we have an error we need this reloc root + * back on our list so we can clean up properly. 
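/*
 * Worked example, not part of this patch (values invented): the two metadata
 * reservations set up above.  merge_reloc_root() must be able to COW a path
 * from level 1 to the root in both the subvolume tree and the reloc tree,
 * and prepare_to_merge() sizes the merge reservation from the worst-case
 * tree height (BTRFS_MAX_LEVEL == 8) plus the bytes of tree nodes already
 * relocated, again doubled for the two trees.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long nodesize = 16384;	/* common btrfs nodesize */
	int root_level = 3;
	int reserve_level = root_level > 1 ? root_level : 1;
	int max_level = 8;			/* BTRFS_MAX_LEVEL */
	unsigned long long nodes_relocated = 8 * nodesize;
	unsigned long long min_reserved, merging_rsv;

	/* merge_reloc_root(): at most root_level nodes COWed per tree, two trees */
	min_reserved = nodesize * reserve_level * 2;

	/* prepare_to_merge(): worst-case path in both trees plus the already
	 * relocated nodes, doubled */
	merging_rsv = nodesize * (max_level - 1) * 2 + nodes_relocated * 2;

	printf("min_reserved = %llu bytes\n", min_reserved);	/* 98304 */
	printf("merging_rsv  = %llu bytes\n", merging_rsv);	/* 491520 */
	return 0;
}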
+ */ + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_abort_transaction(trans, (int)PTR_ERR(root)); + if (!err) + err = PTR_ERR(root); + break; + } + + if (unlikely(root->reloc_root != reloc_root)) { + if (root->reloc_root) { + btrfs_err(fs_info, +"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", + root->root_key.objectid, + root->reloc_root->root_key.objectid, + root->reloc_root->root_key.type, + root->reloc_root->root_key.offset, + btrfs_root_generation( + &root->reloc_root->root_item), + reloc_root->root_key.objectid, + reloc_root->root_key.type, + reloc_root->root_key.offset, + btrfs_root_generation( + &reloc_root->root_item)); + } else { + btrfs_err(fs_info, +"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", + root->root_key.objectid, + reloc_root->root_key.objectid, + reloc_root->root_key.type, + reloc_root->root_key.offset, + btrfs_root_generation( + &reloc_root->root_item)); + } + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(root); + btrfs_abort_transaction(trans, -EUCLEAN); + if (!err) + err = -EUCLEAN; + break; + } + + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); + ret = btrfs_update_reloc_root(trans, root); + + /* + * Even if we have an error we need this reloc root back on our + * list so we can clean up properly. + */ + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(root); + + if (ret) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + break; + } + } + + list_splice(&reloc_roots, &rc->reloc_roots); + + if (!err) + err = btrfs_commit_transaction(trans); + else + btrfs_end_transaction(trans); + return err; +} + +static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ + struct btrfs_root *reloc_root, *tmp; + + list_for_each_entry_safe(reloc_root, tmp, list, root_list) + __del_reloc_root(reloc_root); +} + +static noinline_for_stack +void merge_reloc_roots(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct btrfs_root *root; + struct btrfs_root *reloc_root; + LIST_HEAD(reloc_roots); + int found = 0; + int ret = 0; +again: + root = rc->extent_root; + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&fs_info->reloc_mutex); + + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + + root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, + false); + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + if (WARN_ON(IS_ERR(root))) { + /* + * For recovery we read the fs roots on mount, + * and if we didn't find the root then we marked + * the reloc root as a garbage root. For normal + * relocation obviously the root should exist in + * memory. However there's no reason we can't + * handle the error properly here just in case. + */ + ret = PTR_ERR(root); + goto out; + } + if (WARN_ON(root->reloc_root != reloc_root)) { + /* + * This can happen if on-disk metadata has some + * corruption, e.g. bad reloc tree key offset. 
+ */ + ret = -EINVAL; + goto out; + } + ret = merge_reloc_root(rc, root); + btrfs_put_root(root); + if (ret) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } + } else { + if (!IS_ERR(root)) { + if (root->reloc_root == reloc_root) { + root->reloc_root = NULL; + btrfs_put_root(reloc_root); + } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, + &root->state); + btrfs_put_root(root); + } + + list_del_init(&reloc_root->root_list); + /* Don't forget to queue this reloc root for cleanup */ + list_add_tail(&reloc_root->reloc_dirty_list, + &rc->dirty_subvol_roots); + } + } + + if (found) { + found = 0; + goto again; + } +out: + if (ret) { + btrfs_handle_fs_error(fs_info, ret, NULL); + free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&fs_info->reloc_mutex); + free_reloc_roots(&reloc_roots); + } + + /* + * We used to have + * + * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + * + * here, but it's wrong. If we fail to start the transaction in + * prepare_to_merge() we will have only 0 ref reloc roots, none of which + * have actually been removed from the reloc_root_tree rb tree. This is + * fine because we're bailing here, and we hold a reference on the root + * for the list that holds it, so these roots will be cleaned up when we + * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root + * will be cleaned up on unmount. + * + * The remaining nodes will be cleaned up by free_reloc_control. + */ +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_fs_info *fs_info = reloc_root->fs_info; + struct btrfs_root *root; + int ret; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); + + /* + * This should succeed, since we can't have a reloc root without having + * already looked up the actual root and created the reloc root for this + * root. + * + * However if there's some sort of corruption where we have a ref to a + * reloc root without a corresponding root this could return ENOENT. + */ + if (IS_ERR(root)) { + ASSERT(0); + return PTR_ERR(root); + } + if (root->reloc_root != reloc_root) { + ASSERT(0); + btrfs_err(fs_info, + "root %llu has two reloc roots associated with it", + reloc_root->root_key.offset); + btrfs_put_root(root); + return -EUCLEAN; + } + ret = btrfs_record_root_in_trans(trans, root); + btrfs_put_root(root); + + return ret; +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node, + struct btrfs_backref_edge *edges[]) +{ + struct btrfs_backref_node *next; + struct btrfs_root *root; + int index = 0; + int ret; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a + * block that is owned by a root. 
+ * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve + * improperly. + * + * Both of these cases indicate file system corruption, or a bug + * in the backref walking code. + */ + if (!root) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + break; + } + + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; + + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); + + if (next->new_bytenr != root->node->start) { + /* + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set and this shouldn't be in the changed + * list. If it is then we have multiple roots pointing + * at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. + */ + ASSERT(next->new_bytenr == 0); + ASSERT(list_empty(&next->list)); + if (next->new_bytenr || !list_empty(&next->list)) { + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); + } + + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + list_add_tail(&next->list, + &rc->backref_cache.changed); + mark_block_processed(rc, next); + break; + } + + WARN_ON(1); + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + if (!root) { + /* + * This can happen if there's fs corruption or if there's a bug + * in the backref lookup code. + */ + ASSERT(0); + return ERR_PTR(-ENOENT); + } + + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; + } + return root; +} + +/* + * Select a tree root for relocation. + * + * Return NULL if the block is not shareable. We should use do_relocation() in + * this case. + * + * Return a tree root pointer if the block is shareable. + * Return -ENOENT if the block is root of reloc tree. + */ +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + + /* + * This can occur if we have incomplete extent refs leading all + * the way up a particular path, in this case return -EUCLEAN. 
+ */ + if (!root) + return ERR_PTR(-EUCLEAN); + + /* No other choice for non-shareable tree */ + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; +} + +static noinline_for_stack +u64 calcu_metadata_size(struct reloc_control *rc, + struct btrfs_backref_node *node, int reserve) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct btrfs_backref_node *next = node; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += fs_info->nodesize; + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct btrfs_backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; +} + +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + struct btrfs_root *root = rc->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_bytes; + int ret; + u64 tmp; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; + + trans->block_rsv = rc->block_rsv; + rc->reserved_bytes += num_bytes; + + /* + * We are under a transaction here so we can only do limited flushing. + * If we get an enospc just kick back -EAGAIN so we know to drop the + * transaction and try to refill when we can flush all the things. + */ + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_LIMIT); + if (ret) { + tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return earlier in + * enospc case. + */ + rc->block_rsv->size = tmp + fs_info->nodesize * + RELOCATION_RESERVED_NODES; + return -EAGAIN; + } + + return 0; +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. + */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + int slot; + int ret = 0; + + /* + * If we are lowest then this is the first time we're processing this + * block, and thus shouldn't have an eb associated with it yet. 
+ */ + ASSERT(!lowest || !node->eb); + + path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + struct btrfs_ref ref = { 0 }; + + cond_resched(); + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, rc, upper, edges); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto next; + } + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, 0, key, &slot); + if (ret < 0) + goto next; + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } + btrfs_backref_drop_node_buffer(upper); + } + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + + btrfs_release_path(path); + break; + } + + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } + + upper->locked = 1; + path->locks[upper->level] = 0; + + slot = path->slots[upper->level]; + btrfs_release_path(path); + } else { + ret = btrfs_bin_search(upper->eb, 0, key, &slot); + if (ret < 0) + goto next; + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (lowest) { + if (bytenr != node->bytenr) { + btrfs_err(root->fs_info, + "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", + bytenr, node->bytenr, slot, + upper->eb->start); + ret = -EIO; + goto next; + } + } else { + if (node->eb->start == bytenr) + goto next; + } + + blocksize = root->fs_info->nodesize; + eb = btrfs_read_node_slot(upper->eb, slot); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto next; + } + btrfs_tree_lock(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb, BTRFS_NESTING_COW); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + if (ret < 0) + goto next; + /* + * We've just COWed this block, it should have updated + * the correct backref node entry. + */ + ASSERT(node->eb == eb); + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(trans, upper->eb); + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + node->eb->start, blocksize, + upper->eb->start); + btrfs_init_tree_ref(&ref, node->level, + btrfs_header_owner(upper->eb), + root->root_key.objectid, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (!ret) + ret = btrfs_drop_subtree(trans, root, eb, + upper->eb); + if (ret) + btrfs_abort_transaction(trans, ret); + } +next: + if (!upper->pending) + btrfs_backref_drop_node_buffer(upper); + else + btrfs_backref_unlock_node_buffer(upper); + if (ret) + break; + } + + if (!ret && node->pending) { + btrfs_backref_drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + + path->lowest_level = 0; + + /* + * We should have allocated all of our space in the block rsv and thus + * shouldn't ENOSPC. 
+ */ + ASSERT(ret != -ENOSPC); + return ret; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, rc, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_path *path, int err) +{ + LIST_HEAD(list); + struct btrfs_backref_cache *cache = &rc->backref_cache; + struct btrfs_backref_node *node; + int level; + int ret; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while (!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct btrfs_backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); + + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } + } + list_splice_init(&list, &cache->pending[level]); + } + return err; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *next = node; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct btrfs_backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, struct reloc_control *rc) +{ + u32 blocksize = rc->extent_root->fs_info->nodesize; + + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; +} + +static int get_tree_block_key(struct btrfs_fs_info *fs_info, + struct tree_block *block) +{ + struct btrfs_tree_parent_check check = { + .level = block->level, + .owner_root = block->owner, + .transid = block->key.offset + }; + struct extent_buffer *eb; + + eb = read_tree_block(fs_info, block->bytenr, &check); + if (IS_ERR(eb)) + return PTR_ERR(eb); + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int ret = 0; + + if (!node) + return 0; + + /* + * If we fail here we want to drop our backref_node because we are going + * to start over and regenerate the tree for it. + */ + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + + BUG_ON(node->processed); + root = select_one_root(node); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + + /* See explanation in select_one_root for the -EUCLEAN case. 
*/ + ASSERT(ret == -ENOENT); + if (ret == -ENOENT) { + ret = 0; + update_processed_blocks(rc, node); + } + goto out; + } + + if (root) { + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + /* + * This block was the root block of a root, and this is + * the first time we're processing the block and thus it + * should not have had the ->new_bytenr modified and + * should have not been included on the changed list. + * + * However in the case of corruption we could have + * multiple refs pointing to the same block improperly, + * and thus we would trip over these checks. ASSERT() + * for the developer case, because it could indicate a + * bug in the backref code, however error out for a + * normal user in the case of corruption. + */ + ASSERT(node->new_bytenr == 0); + ASSERT(list_empty(&node->list)); + if (node->new_bytenr || !list_empty(&node->list)) { + btrfs_err(root->fs_info, + "bytenr %llu has improper references to it", + node->bytenr); + ret = -EUCLEAN; + goto out; + } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + goto out; + /* + * Another thread could have failed, need to check if we + * have reloc_root actually set. + */ + if (!root->reloc_root) { + ret = -ENOENT; + goto out; + } + root = root->reloc_root; + node->new_bytenr = root->node->start; + btrfs_put_root(node->root); + node->root = btrfs_grab_root(root); + ASSERT(node->root); + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(path); + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, node, key, path, 1); + } +out: + if (ret || node->level == 0 || node->cowonly) + btrfs_backref_cleanup_node(&rc->backref_cache, node); + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct btrfs_backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct tree_block *next; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out_free_blocks; + } + + /* Kick in readahead for tree blocks with missing keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + if (!block->key_ready) + btrfs_readahead_tree_block(fs_info, block->bytenr, + block->owner, 0, + block->level); + } + + /* Get first keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + if (!block->key_ready) { + err = get_tree_block_key(fs_info, block); + if (err) + goto out_free_path; + } + } + + /* Do tree relocation */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + node = build_backref_tree(trans, rc, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + err = ret; + break; + } + } +out: + err = finish_pending_nodes(trans, rc, path, err); + +out_free_path: + btrfs_free_path(path); +out_free_blocks: + free_block_list(blocks); + return err; +} + +static noinline_for_stack int prealloc_file_extent_cluster( + struct 
btrfs_inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = inode->index_cnt; + u64 num_bytes; + int nr; + int ret = 0; + u64 i_size = i_size_read(&inode->vfs_inode); + u64 prealloc_start = cluster->start - offset; + u64 prealloc_end = cluster->end - offset; + u64 cur_offset = prealloc_start; + + /* + * For subpage case, previous i_size may not be aligned to PAGE_SIZE. + * This means the range [i_size, PAGE_END + 1) is filled with zeros by + * btrfs_do_readpage() call of previously relocated file cluster. + * + * If the current cluster starts in the above range, btrfs_do_readpage() + * will skip the read, and relocate_one_page() will later writeback + * the padding zeros as new data, causing data corruption. + * + * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + */ + if (!PAGE_ALIGNED(i_size)) { + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + struct page *page; + + ASSERT(sectorsize < PAGE_SIZE); + ASSERT(IS_ALIGNED(i_size, sectorsize)); + + /* + * Subpage can't handle page with DIRTY but without UPTODATE + * bit as it can lead to the following deadlock: + * + * btrfs_read_folio() + * | Page already *locked* + * |- btrfs_lock_and_flush_ordered_range() + * |- btrfs_start_ordered_extent() + * |- extent_write_cache_pages() + * |- lock_page() + * We try to lock the page we already hold. + * + * Here we just writeback the whole data reloc inode, so that + * we will be ensured to have no dirty range in the page, and + * are safe to clear the uptodate bits. + * + * This shouldn't cause too much overhead, as we need to write + * the data back anyway. + */ + ret = filemap_write_and_wait(mapping); + if (ret < 0) + return ret; + + clear_extent_bits(&inode->io_tree, i_size, + round_up(i_size, PAGE_SIZE) - 1, + EXTENT_UPTODATE); + page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + /* + * If page is freed we don't need to do anything then, as we + * will re-read the whole page anyway. 
+ */ + if (page) { + btrfs_subpage_clear_uptodate(fs_info, page, i_size, + round_up(i_size, PAGE_SIZE) - i_size); + unlock_page(page); + put_page(page); + } + } + + BUG_ON(cluster->start != cluster->boundary[0]); + ret = btrfs_alloc_data_chunk_ondemand(inode, + prealloc_end + 1 - prealloc_start); + if (ret) + return ret; + + btrfs_inode_lock(inode, 0); + for (nr = 0; nr < cluster->nr; nr++) { + struct extent_state *cached_state = NULL; + + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&inode->io_tree, start, end, &cached_state); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + cur_offset = end + 1; + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (ret) + break; + } + btrfs_inode_unlock(inode, 0); + + if (cur_offset < prealloc_end) + btrfs_free_reserved_data_space_noquota(inode->root->fs_info, + prealloc_end + 1 - cur_offset); + return ret; +} + +static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, + u64 start, u64 end, u64 block_start) +{ + struct extent_map *em; + struct extent_state *cached_state = NULL; + int ret = 0; + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + free_extent_map(em); + + return ret; +} + +/* + * Allow error injection to test balance/relocation cancellation + */ +noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +{ + return atomic_read(&fs_info->balance_cancel_req) || + atomic_read(&fs_info->reloc_cancel_req) || + fatal_signal_pending(current); +} +ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); + +static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, + int cluster_nr) +{ + /* Last extent, use cluster end directly */ + if (cluster_nr >= cluster->nr - 1) + return cluster->end; + + /* Use next boundary start*/ + return cluster->boundary[cluster_nr + 1] - 1; +} + +static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, + struct file_extent_cluster *cluster, + int *cluster_nr, unsigned long page_index) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 offset = BTRFS_I(inode)->index_cnt; + const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + struct page *page; + u64 page_start; + u64 page_end; + u64 cur; + int ret; + + ASSERT(page_index <= last_index); + page = find_lock_page(inode->i_mapping, page_index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + page_index, last_index + 1 - page_index); + page = find_or_create_page(inode->i_mapping, page_index, mask); + if (!page) + return -ENOMEM; + } + + if (PageReadahead(page)) + page_cache_async_readahead(inode->i_mapping, ra, NULL, + page_folio(page), page_index, + last_index + 1 - page_index); + + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (!PageUptodate(page)) { + ret = -EIO; + goto release_page; + } + } + + /* + * We could have lost page private when we 
dropped the lock to read the + * page above, make sure we set_page_extent_mapped here so we have any + * of the subpage blocksize stuff we need in place. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + + page_start = page_offset(page); + page_end = page_start + PAGE_SIZE - 1; + + /* + * Start from the cluster, as for subpage case, the cluster can start + * inside the page. + */ + cur = max(page_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= page_end) { + struct extent_state *cached_state = NULL; + u64 extent_start = cluster->boundary[*cluster_nr] - offset; + u64 extent_end = get_cluster_boundary_end(cluster, + *cluster_nr) - offset; + u64 clamped_start = max(page_start, extent_start); + u64 clamped_end = min(page_end, extent_end); + u32 clamped_len = clamped_end + 1 - clamped_start; + + /* Reserve metadata for this range */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + clamped_len, clamped_len, + false); + if (ret) + goto release_page; + + /* Mark the range delalloc and dirty for later writeback */ + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, + clamped_end, 0, &cached_state); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + clamped_len, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + clamped_len); + goto release_page; + } + btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + + /* + * Set the boundary if it's inside the page. + * Data relocation requires the destination extents to have the + * same size as the source. + * EXTENT_BOUNDARY bit prevents current extent from being merged + * with previous extent. + */ + if (in_range(cluster->boundary[*cluster_nr] - offset, + page_start, PAGE_SIZE)) { + u64 boundary_start = cluster->boundary[*cluster_nr] - + offset; + u64 boundary_end = boundary_start + + fs_info->sectorsize - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); + } + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); + btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); + cur += clamped_len; + + /* Crossed extent end, go to next extent */ + if (cur >= extent_end) { + (*cluster_nr)++; + /* Just finished the last extent of the cluster, exit. 
*/ + if (*cluster_nr >= cluster->nr) + break; + } + } + unlock_page(page); + put_page(page); + + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(fs_info); + if (btrfs_should_cancel_balance(fs_info)) + ret = -ECANCELED; + return ret; + +release_page: + unlock_page(page); + put_page(page); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; + unsigned long last_index; + struct file_ra_state *ra; + int cluster_nr = 0; + int ret = 0; + + if (!cluster->nr) + return 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); + if (ret) + goto out; + + file_ra_state_init(ra, inode->i_mapping); + + ret = setup_relocation_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); + if (ret) + goto out; + + last_index = (cluster->end - offset) >> PAGE_SHIFT; + for (index = (cluster->start - offset) >> PAGE_SHIFT; + index <= last_index && !ret; index++) + ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); + if (ret == 0) + WARN_ON(cluster_nr != cluster->nr); +out: + kfree(ra); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) +{ + int ret; + + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; +} + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + u64 generation; + u64 owner = 0; + + eb = path->nodes[0]; + item_size = btrfs_item_size(eb, path->slots[0]); + + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { + unsigned long ptr = 0, end; + + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + end = (unsigned long)ei + item_size; + if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + level = btrfs_tree_block_level(eb, bi); + ptr = (unsigned long)(bi + 1); + } else { + level = (int)extent_key->offset; + ptr = (unsigned long)(ei + 1); + } + generation = btrfs_extent_generation(eb, ei); + + /* + * We're reading random blocks without knowing their owner ahead + * of time. This is ok most of the time, as all reloc roots and + * fs roots have the same lock type. However normal trees do + * not, and the only way to know ahead of time is to read the + * inline ref offset. We know it's an fs root if + * + * 1. There's more than one ref. + * 2. There's a SHARED_DATA_REF_KEY set. + * 3. FULL_BACKREF is set on the flags. 
+ * + * Otherwise it's safe to assume that the ref offset == the + * owner of this block, so we can use that when calling + * read_tree_block. + */ + if (btrfs_extent_refs(eb, ei) == 1 && + !(btrfs_extent_flags(eb, ei) & + BTRFS_BLOCK_FLAG_FULL_BACKREF) && + ptr < end) { + struct btrfs_extent_inline_ref *iref; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; + if (type == BTRFS_TREE_BLOCK_REF_KEY) + owner = btrfs_extent_inline_ref_offset(eb, iref); + } + } else { + btrfs_print_leaf(eb); + btrfs_err(rc->block_group->fs_info, + "unrecognized tree backref at tree block %llu slot %u", + eb->start, path->slots[0]); + btrfs_release_path(path); + return -EUCLEAN; + } + + btrfs_release_path(path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = rc->extent_root->fs_info->nodesize; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + block->owner = owner; + + rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); + if (rb_node) + btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, + -EEXIST); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key; + int ret; + bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + + if (tree_block_processed(bytenr, rc)) + return 0; + + if (rb_simple_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + key.objectid = bytenr; + if (skinny) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + } + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0 && skinny) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + (key.type == BTRFS_METADATA_ITEM_KEY || + (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == blocksize))) + ret = 0; + } + + if (ret) { + skinny = false; + btrfs_release_path(path); + goto again; + } + } + if (ret) { + ASSERT(ret == 1); + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, + "tree block extent item (%llu) is not found in extent tree", + bytenr); + WARN_ON(1); + ret = -EINVAL; + goto out; + } + + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *block_group, + struct inode *inode, + u64 ino) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (inode) + goto truncate; + + inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(inode)) + return -ENOENT; + +truncate: + ret = btrfs_check_trunc_cache_free_space(fs_info, + &fs_info->global_block_rsv); + if (ret) + goto out; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = 
btrfs_truncate_free_space_cache(trans, block_group, inode); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); +out: + iput(inode); + return ret; +} + +/* + * Locate the free space cache EXTENT_DATA in root tree leaf and delete the + * cache inode, to avoid free space cache data extent blocking data relocation. + */ +static int delete_v1_space_cache(struct extent_buffer *leaf, + struct btrfs_block_group *block_group, + u64 data_bytenr) +{ + u64 space_cache_ino; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + bool found = false; + int i; + int ret; + + if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) + return 0; + + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + u8 type; + + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + + if ((type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) && + btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { + found = true; + space_cache_ino = key.objectid; + break; + } + } + if (!found) + return -ENOENT; + ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, + space_cache_ino); + return ret; +} + +/* + * helper to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct ulist_iterator leaf_uiter; + struct ulist_node *ref_node = NULL; + const u32 blocksize = rc->extent_root->fs_info->nodesize; + int ret = 0; + + btrfs_release_path(path); + + ctx.bytenr = extent_key->objectid; + ctx.skip_inode_ref_list = true; + ctx.fs_info = rc->extent_root->fs_info; + + ret = btrfs_find_all_leafs(&ctx); + if (ret < 0) + return ret; + + ULIST_ITER_INIT(&leaf_uiter); + while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { + struct btrfs_tree_parent_check check = { 0 }; + struct extent_buffer *eb; + + eb = read_tree_block(ctx.fs_info, ref_node->val, &check); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + break; + } + ret = delete_v1_space_cache(eb, rc->block_group, + extent_key->objectid); + free_extent_buffer(eb); + if (ret < 0) + break; + ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); + if (ret < 0) + break; + } + if (ret < 0) + free_block_list(blocks); + ulist_free(ctx.refs); + return ret; +} + +/* + * helper to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->start + rc->block_group->length; + while (1) { + bool block_found; + + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 
1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.objectid + fs_info->nodesize <= + rc->search_start) { + path->slots[0]++; + goto next; + } + + block_found = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); + + if (block_found && start <= key.objectid) { + btrfs_release_path(path); + rc->search_start = end + 1; + } else { + if (key.type == BTRFS_EXTENT_ITEM_KEY) + rc->search_start = key.objectid + key.offset; + else + rc->search_start = key.objectid + + fs_info->nodesize; + memcpy(extent_key, &key, sizeof(key)); + return 0; + } + } + btrfs_release_path(path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->reloc_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->reloc_mutex); +} + +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + int ret; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, + BTRFS_BLOCK_RSV_TEMP); + if (!rc->block_rsv) + return -ENOMEM; + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->start; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->fs_info->nodesize * + RELOCATION_RESERVED_NODES; + ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + return ret; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + /* + * extent tree is not a ref_cow tree and has no reloc_root to + * cleanup. And callers are responsible to free the above + * block rsv. 
+ */ + return PTR_ERR(trans); + } + + ret = btrfs_commit_transaction(trans); + if (ret) + unset_reloc_control(rc); + + return ret; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + u64 flags; + int ret; + int err = 0; + int progress = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } + + while (1) { + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } + progress++; + trans = btrfs_start_transaction(rc->extent_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + break; + } +restart: + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans); + trans = NULL; + continue; + } + + ret = find_next_extent(rc, path, &key); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + flags = btrfs_extent_flags(path->nodes[0], ei); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(path); + ret = 0; + } + if (ret < 0) { + err = ret; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + btrfs_end_transaction_throttle(trans); + btrfs_btree_balance_dirty(fs_info); + trans = NULL; + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, + &key, &rc->cluster); + if (ret < 0) { + err = ret; + break; + } + } + if (btrfs_should_cancel_balance(fs_info)) { + err = -ECANCELED; + break; + } + } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); + if (ret == 1) { + err = 0; + progress = 0; + goto restart; + } + } + + btrfs_release_path(path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + + if (trans) { + btrfs_end_transaction_throttle(trans); + btrfs_btree_balance_dirty(fs_info); + } + + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); + if (ret < 0) + err = ret; + } + + rc->create_reloc_tree = 0; + set_reloc_control(rc); + + btrfs_backref_release_cache(&rc->backref_cache); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); + + /* + * Even in the case when the relocation is cancelled, we should all go + * through prepare_to_merge() and merge_reloc_roots(). 
+ * + * For error (including cancelled balance), prepare_to_merge() will + * mark all reloc trees orphan, then queue them for cleanup in + * merge_reloc_roots() + */ + err = prepare_to_merge(rc, err); + + merge_reloc_roots(rc); + + rc->merge_reloc_tree = 0; + unset_reloc_control(rc); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); + + /* get rid of pinned extents */ + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + ret = btrfs_commit_transaction(trans); + if (ret && !err) + err = ret; +out_free: + ret = clean_dirty_subvols(rc); + if (ret < 0 && !err) + err = ret; + btrfs_free_block_rsv(fs_info, rc->block_rsv); + btrfs_free_path(path); + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); + btrfs_mark_buffer_dirty(trans, leaf); +out: + btrfs_free_path(path); + return ret; +} + +static void delete_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + ret = btrfs_del_item(trans, root, path); +out: + if (ret) + btrfs_abort_transaction(trans, ret); + btrfs_free_path(path); +} + +/* + * helper to create inode for data relocation. + * the inode is in data relocation tree and its link count is 0 + */ +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + u64 objectid; + int err = 0; + + root = btrfs_grab_root(fs_info->data_reloc_root); + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) { + btrfs_put_root(root); + return ERR_CAST(trans); + } + + err = btrfs_get_free_objectid(root, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid); + if (err) + goto out; + + inode = btrfs_iget(fs_info->sb, objectid, root); + if (IS_ERR(inode)) { + delete_orphan_inode(trans, root, objectid); + err = PTR_ERR(inode); + inode = NULL; + goto out; + } + BTRFS_I(inode)->index_cnt = group->start; + + err = btrfs_orphan_add(trans, BTRFS_I(inode)); +out: + btrfs_put_root(root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + if (err) { + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +/* + * Mark start of chunk relocation that is cancellable. Check if the cancellation + * has been requested meanwhile and don't start in that case. 
+ * + * Return: + * 0 success + * -EINPROGRESS operation is already in progress, that's probably a bug + * -ECANCELED cancellation request was set before the operation started + */ +static int reloc_chunk_start(struct btrfs_fs_info *fs_info) +{ + if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + /* This should not happen */ + btrfs_err(fs_info, "reloc already running, cannot start"); + return -EINPROGRESS; + } + + if (atomic_read(&fs_info->reloc_cancel_req) > 0) { + btrfs_info(fs_info, "chunk relocation canceled on start"); + /* + * On cancel, clear all requests but let the caller mark + * the end after cleanup operations. + */ + atomic_set(&fs_info->reloc_cancel_req, 0); + return -ECANCELED; + } + return 0; +} + +/* + * Mark end of chunk relocation that is cancellable and wake any waiters. + */ +static void reloc_chunk_end(struct btrfs_fs_info *fs_info) +{ + /* Requested after start, clear bit first so any waiters can continue */ + if (atomic_read(&fs_info->reloc_cancel_req) > 0) + btrfs_info(fs_info, "chunk relocation canceled during operation"); + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); + atomic_set(&fs_info->reloc_cancel_req, 0); +} + +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + INIT_LIST_HEAD(&rc->dirty_subvol_roots); + btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); + return rc; +} + +static void free_reloc_control(struct reloc_control *rc) +{ + struct mapping_node *node, *tmp; + + free_reloc_roots(&rc->reloc_roots); + rbtree_postorder_for_each_entry_safe(node, tmp, + &rc->reloc_root_tree.rb_root, rb_node) + kfree(node); + + kfree(rc); +} + +/* + * Print the block group being relocated + */ +static void describe_relocation(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *block_group) +{ + char buf[128] = {'\0'}; + + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); + + btrfs_info(fs_info, + "relocating block group %llu flags %s", + block_group->start, buf); +} + +static const char *stage_to_string(int stage) +{ + if (stage == MOVE_DATA_EXTENTS) + return "move data extents"; + if (stage == UPDATE_DATA_PTRS) + return "update data pointers"; + return "unknown"; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) +{ + struct btrfs_block_group *bg; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); + struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; + int ret; + int rw = 0; + int err = 0; + + /* + * This only gets set if we had a half-deleted snapshot on mount. We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + + bg = btrfs_lookup_block_group(fs_info, group_start); + if (!bg) + return -ENOENT; + + /* + * Relocation of a data block group creates ordered extents. Without + * sb_start_write(), we can freeze the filesystem while unfinished + * ordered extents are left. 
Such ordered extents can cause a deadlock + * e.g. when syncfs() is waiting for their completion but they can't + * finish because they block when joining a transaction, due to the + * fact that the freeze locks are being held in write mode. + */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + ASSERT(sb_write_started(fs_info->sb)); + + if (btrfs_pinned_by_swapfile(fs_info, bg)) { + btrfs_put_block_group(bg); + return -ETXTBSY; + } + + rc = alloc_reloc_control(fs_info); + if (!rc) { + btrfs_put_block_group(bg); + return -ENOMEM; + } + + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_put_bg; + } + + rc->extent_root = extent_root; + rc->block_group = bg; + + ret = btrfs_inc_block_group_ro(rc->block_group, true); + if (ret) { + err = ret; + goto out; + } + rw = 1; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(rc->block_group, path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + describe_relocation(fs_info, rc->block_group); + + btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); + btrfs_wait_ordered_roots(fs_info, U64_MAX, + rc->block_group->start, + rc->block_group->length); + + ret = btrfs_zone_finish(rc->block_group); + WARN_ON(ret && ret != -EAGAIN); + + while (1) { + int finishes_stage; + + mutex_lock(&fs_info->cleaner_mutex); + ret = relocate_block_group(rc); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) + err = ret; + + finishes_stage = rc->stage; + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. 
+ */ + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + ret = btrfs_wait_ordered_range(rc->data_inode, 0, + (u64)-1); + if (ret) + err = ret; + invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } + + if (err < 0) + goto out; + + if (rc->extents_found == 0) + break; + + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, stage_to_string(finishes_stage)); + } + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(rc->block_group->used > 0); +out: + if (err && rw) + btrfs_dec_block_group_ro(rc->block_group); + iput(rc->data_inode); +out_put_bg: + btrfs_put_block_group(bg); + reloc_chunk_end(fs_info); + free_reloc_control(rc); + return err; +} + +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; + int ret, err; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + btrfs_set_root_drop_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + + err = btrfs_end_transaction(trans); + if (err) + return err; + return ret; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_BACK; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = btrfs_get_fs_root(fs_info, + reloc_root->root_key.offset, false); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + ret = mark_garbage_root(reloc_root); + if (ret < 0) { + err = ret; + goto out; + } + } else { + btrfs_put_root(fs_root); + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = alloc_reloc_control(fs_info); + if (!rc) { + err = -ENOMEM; + goto out; + } + + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_end; + } + + rc->extent_root = btrfs_extent_root(fs_info, 0); + + set_reloc_control(rc); + + 
trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_unset; + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, + false); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_end_transaction(trans); + goto out_unset; + } + + err = __add_reloc_root(reloc_root); + ASSERT(err != -EEXIST); + if (err) { + list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(fs_root); + btrfs_end_transaction(trans); + goto out_unset; + } + fs_root->reloc_root = btrfs_grab_root(reloc_root); + btrfs_put_root(fs_root); + } + + err = btrfs_commit_transaction(trans); + if (err) + goto out_unset; + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_clean; + } + err = btrfs_commit_transaction(trans); +out_clean: + ret = clean_dirty_subvols(rc); + if (ret < 0 && !err) + err = ret; +out_unset: + unset_reloc_control(rc); +out_end: + reloc_chunk_end(fs_info); + free_reloc_control(rc); +out: + free_reloc_roots(&reloc_roots); + + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = btrfs_grab_root(fs_info->data_reloc_root); + ASSERT(fs_root); + err = btrfs_orphan_cleanup(fs_root); + btrfs_put_root(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 disk_bytenr = ordered->file_offset + inode->index_cnt; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); + LIST_HEAD(list); + int ret; + + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + ordered->num_bytes - 1, + &list, 0, false); + if (ret) + return ret; + + while (!list_empty(&list)) { + struct btrfs_ordered_sum *sums = + list_entry(list.next, struct btrfs_ordered_sum, list); + + list_del_init(&sums->list); + + /* + * We need to offset the new_bytenr based on where the csum is. + * We need to do this because we will read in entire prealloc + * extents but we may have written to say the middle of the + * prealloc extent, so we need to make sure the csum goes with + * the right disk offset. + * + * We can do this because the data reloc inode refers strictly + * to the on disk bytes, so we don't have to worry about + * disk_len vs real len like with real inodes since it's all + * disk length. 
+ */ + sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr; + btrfs_add_ordered_sum(ordered, sums); + } + + return 0; +} + +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct reloc_control *rc; + struct btrfs_backref_node *node; + int first_cow = 0; + int level; + int ret = 0; + + rc = fs_info->reloc_ctl; + if (!rc) + return 0; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + btrfs_backref_drop_node_buffer(node); + atomic_inc(&cow->refs); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) + ret = replace_file_extents(trans, rc, root, cow); + return ret; +} + +/* + * called before creating snapshot. it calculates metadata reservation + * required for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root = pending->root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + if (!rc || !have_reloc_root(root)) + return; + + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + * + * This is similar to btrfs_init_reloc_root(), we come out of here with two + * references held on the reloc_root, one for root->reloc_root and one for + * rc->reloc_roots. 
+ */ +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + int ret; + + if (!rc || !have_reloc_root(root)) + return 0; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated, true); + if (ret) + return ret; + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); + + ret = __add_reloc_root(reloc_root); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } + new_root->reloc_root = btrfs_grab_root(reloc_root); + + if (rc->create_reloc_tree) + ret = clone_backref_node(trans, rc, root, reloc_root); + return ret; +} + +/* + * Get the current bytenr for the block group which is being relocated. + * + * Return U64_MAX if no running relocation. + */ +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +{ + u64 logical = U64_MAX; + + lockdep_assert_held(&fs_info->reloc_mutex); + + if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) + logical = fs_info->reloc_ctl->block_group->start; + return logical; +} diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h new file mode 100644 index 0000000000..77d69f6ae9 --- /dev/null +++ b/fs/btrfs/relocation.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_RELOCATION_H +#define BTRFS_RELOCATION_H + +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 0000000000..5b0f1bccc4 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" +#include "qgroup.h" +#include "space-info.h" +#include "accessors.h" +#include "root-tree.h" +#include "orphan.h" + +/* + * Read a root item from the tree. In case we detect a root item smaller then + * sizeof(root_item), we know it's an old version of the root structure and + * initialize all new fields to zero. 
The same happens if we detect mismatching + * generation numbers as then we know the root was once mounted with an older + * kernel that was not aware of the root item structure change. + */ +static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) +{ + u32 len; + int need_reset = 0; + + len = btrfs_item_size(eb, slot); + read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), + min_t(u32, len, sizeof(*item))); + if (len < sizeof(*item)) + need_reset = 1; + if (!need_reset && btrfs_root_generation(item) + != btrfs_root_generation_v2(item)) { + if (btrfs_root_generation_v2(item) != 0) { + btrfs_warn(eb->fs_info, + "mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields."); + } + need_reset = 1; + } + if (need_reset) { + /* Clear all members from generation_v2 onwards. */ + memset_startat(item, 0, generation_v2); + generate_random_guid(item->uuid); + } +} + +/* + * btrfs_find_root - lookup the root by the key. + * root: the root of the root tree + * search_key: the key to search + * path: the path we search + * root_item: the root item of the tree we look for + * root_key: the root key of the tree we look for + * + * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset + * of the search key, just lookup the root with the highest offset for a + * given objectid. + * + * If we find something return 0, otherwise > 0, < 0 on error. + */ +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key) +{ + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0); + if (ret < 0) + return ret; + + if (search_key->offset != -1ULL) { /* the search key is exact */ + if (ret > 0) + goto out; + } else { + BUG_ON(ret == 0); /* Logical error */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + ret = 0; + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != search_key->objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } + + if (root_item) + btrfs_read_root_item(l, slot, root_item); + if (root_key) + memcpy(root_key, &found_key, sizeof(found_key)); +out: + btrfs_release_path(path); + return ret; +} + +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + u32 old_len; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + btrfs_crit(fs_info, + "unable to find root key (%llu %u %llu) in tree %llu", + key->objectid, key->type, key->offset, + root->root_key.objectid); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = 
btrfs_item_ptr_offset(l, slot); + old_len = btrfs_item_size(l, slot); + + /* + * If this is the first time we update the root item which originated + * from an older kernel, we need to enlarge the item size to make room + * for the added fields. + */ + if (old_len < sizeof(*item)) { + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, key, path, + -1, 1); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, + key, sizeof(*item)); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + } + + /* + * Update generation_v2 so at the next mount we know the new root + * fields are valid. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, struct btrfs_root_item *item) +{ + /* + * Make sure generation v1 and v2 match. See update_root for details. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + return btrfs_insert_item(trans, root, key, item, sizeof(*item)); +} + +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + while (1) { + u64 root_objectid; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + root_objectid = key.offset; + key.offset++; + + root = btrfs_get_fs_root(fs_info, root_objectid, false); + err = PTR_ERR_OR_ZERO(root); + if (err && err != -ENOENT) { + break; + } else if (err == -ENOENT) { + struct btrfs_trans_handle *trans; + + btrfs_release_path(path); + + trans = btrfs_join_transaction(tree_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_handle_fs_error(fs_info, err, + "Failed to start trans to delete orphan item"); + break; + } + err = btrfs_del_orphan_item(trans, tree_root, + root_objectid); + btrfs_end_transaction(trans); + if (err) { + btrfs_handle_fs_error(fs_info, err, + "Failed to delete root orphan item"); + break; + } + continue; + } + + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); + if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this 
snapshot is completely dropped. + */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); + btrfs_add_dead_root(root); + } + btrfs_put_root(root); + } + + btrfs_free_path(path); + return err; +} + +/* drop the root item for 'key' from the tree root */ +int btrfs_del_root(struct btrfs_trans_handle *trans, + const struct btrfs_key *key) +{ + struct btrfs_root *root = trans->fs_info->tree_root; + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name) +{ + struct btrfs_root *tree_root = trans->fs_info->tree_root; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) { + goto out; + } else if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + ptr = (unsigned long)(ref + 1); + if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || + (btrfs_root_ref_name_len(leaf, ref) != name->len) || + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { + ret = -ENOENT; + goto out; + } + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + } else { + ret = -ENOENT; + goto out; + } + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. 
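 *
 * [Editor's note, not part of the upstream file: the function body below
 * inserts the item twice, first under the key (root_id BTRFS_ROOT_BACKREF_KEY
 * ref_id) and then, via the "again" label, under (ref_id BTRFS_ROOT_REF_KEY
 * root_id), so both directions of the reference carry the same
 * dirid/sequence/name payload.]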
+ * + * Will return 0, -ENOMEM, or anything from the CoW path + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name) +{ + struct btrfs_root *tree_root = trans->fs_info->tree_root; + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name->len); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name->len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name->name, ptr, name->len); + btrfs_mark_buffer_dirty(trans, leaf); + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + + btrfs_free_path(path); + return 0; +} + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. + */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + btrfs_set_stack_inode_flags(&root_item->inode, inode_flags); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); + } +} + +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root_item *item = &root->root_item; + struct timespec64 ct; + + ktime_get_real_ts64(&ct); + spin_lock(&root->root_item_lock); + btrfs_set_root_ctransid(item, trans->transid); + btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); + btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); + spin_unlock(&root->root_item_lock); +} + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * use_global_rsv: allow fallback to the global block reservation + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reservation mechanism in start_transaction(). 
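 *
 * [Editor's note, not part of the upstream file: as a worked example of the
 * code below, with quotas enabled and a 16 KiB nodesize the qgroup prealloc
 * is 3 * 16 KiB = 48 KiB (one item for the parent inode, two for the dir
 * entries), while the block reservation itself is
 * btrfs_calc_insert_metadata_size(fs_info, items) and may fall back to the
 * global reserve on -ENOSPC when use_global_rsv is set.]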
+ */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, int items, + bool use_global_rsv) +{ + u64 qgroup_num_bytes = 0; + u64 num_bytes; + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* One for parent inode, two for dir entries */ + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true, + false); + if (ret) + return ret; + } + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); + + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); +} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h new file mode 100644 index 0000000000..cbbaca3212 --- /dev/null +++ b/fs/btrfs/root-tree.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ROOT_TREE_H +#define BTRFS_ROOT_TREE_H + +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name); +int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c new file mode 100644 index 0000000000..1e3ff87d04 --- /dev/null +++ b/fs/btrfs/scrub.c @@ -0,0 +1,3062 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2011, 2012 STRATO. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include "ctree.h" +#include "discard.h" +#include "volumes.h" +#include "disk-io.h" +#include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h" +#include "dev-replace.h" +#include "check-integrity.h" +#include "raid56.h" +#include "block-group.h" +#include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "scrub.h" + +/* + * This is only the first step towards a full-features scrub. It reads all + * extent and super block and verifies the checksums. In case a bad checksum + * is found or the extent cannot be read, good data will be written back if + * any can be found. + * + * Future enhancements: + * - In case an unrepairable extent is encountered, track which files are + * affected and report them + * - track and record media errors, throw out bad devices + * - add a mode to also read unallocated space + */ + +struct scrub_ctx; + +/* + * The following value only influences the performance. + * + * This detemines how many stripes would be submitted in one go, + * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). + */ +#define SCRUB_STRIPES_PER_GROUP 8 + +/* + * How many groups we have for each sctx. + * + * This would be 8M per device, the same value as the old scrub in-flight bios + * size limit. + */ +#define SCRUB_GROUPS_PER_SCTX 16 + +#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) + +/* + * The following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + */ +#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) + +/* Represent one sector and its needed info to verify the content. */ +struct scrub_sector_verification { + bool is_metadata; + + union { + /* + * Csum pointer for data csum verification. Should point to a + * sector csum inside scrub_stripe::csums. + * + * NULL if this data sector has no csum. + */ + u8 *csum; + + /* + * Extra info for metadata verification. All sectors inside a + * tree block share the same generation. + */ + u64 generation; + }; +}; + +enum scrub_stripe_flags { + /* Set when @mirror_num, @dev, @physical and @logical are set. */ + SCRUB_STRIPE_FLAG_INITIALIZED, + + /* Set when the read-repair is finished. */ + SCRUB_STRIPE_FLAG_REPAIR_DONE, + + /* + * Set for data stripes if it's triggered from P/Q stripe. + * During such scrub, we should not report errors in data stripes, nor + * update the accounting. + */ + SCRUB_STRIPE_FLAG_NO_REPORT, +}; + +#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) + +/* + * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. + */ +struct scrub_stripe { + struct scrub_ctx *sctx; + struct btrfs_block_group *bg; + + struct page *pages[SCRUB_STRIPE_PAGES]; + struct scrub_sector_verification *sectors; + + struct btrfs_device *dev; + u64 logical; + u64 physical; + + u16 mirror_num; + + /* Should be BTRFS_STRIPE_LEN / sectorsize. */ + u16 nr_sectors; + + /* + * How many data/meta extents are in this stripe. Only for scrub status + * reporting purposes. + */ + u16 nr_data_extents; + u16 nr_meta_extents; + + atomic_t pending_io; + wait_queue_head_t io_wait; + wait_queue_head_t repair_wait; + + /* + * Indicate the states of the stripe. Bits are defined in + * scrub_stripe_flags enum. + */ + unsigned long state; + + /* Indicate which sectors are covered by extent items. 
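 *
 * [Editor's note, not part of the upstream file: with the 64 KiB
 * BTRFS_STRIPE_LEN and a 4 KiB sector size a stripe holds 16 sectors, so a
 * single unsigned long is enough for this and the other per-sector bitmaps
 * below.]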
*/ + unsigned long extent_sector_bitmap; + + /* + * The errors hit during the initial read of the stripe. + * + * Would be utilized for error reporting and repair. + * + * The remaining init_nr_* records the number of errors hit, only used + * by error reporting. + */ + unsigned long init_error_bitmap; + unsigned int init_nr_io_errors; + unsigned int init_nr_csum_errors; + unsigned int init_nr_meta_errors; + + /* + * The following error bitmaps are all for the current status. + * Every time we submit a new read, these bitmaps may be updated. + * + * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; + * + * IO and csum errors can happen for both metadata and data. + */ + unsigned long error_bitmap; + unsigned long io_error_bitmap; + unsigned long csum_error_bitmap; + unsigned long meta_error_bitmap; + + /* For writeback (repair or replace) error reporting. */ + unsigned long write_error_bitmap; + + /* Writeback can be concurrent, thus we need to protect the bitmap. */ + spinlock_t write_error_lock; + + /* + * Checksum for the whole stripe if this stripe is inside a data block + * group. + */ + u8 *csums; + + struct work_struct work; +}; + +struct scrub_ctx { + struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; + struct scrub_stripe *raid56_data_stripes; + struct btrfs_fs_info *fs_info; + struct btrfs_path extent_path; + struct btrfs_path csum_path; + int first_free; + int cur_stripe; + atomic_t cancel_req; + int readonly; + int sectors_per_bio; + + /* State of IO submission throttling affecting the associated device */ + ktime_t throttle_deadline; + u64 throttle_sent; + + int is_dev_replace; + u64 write_pointer; + + struct mutex wr_lock; + struct btrfs_device *wr_tgtdev; + + /* + * statistics + */ + struct btrfs_scrub_progress stat; + spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. 
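 *
 * [Editor's note, not part of the upstream file: the counter starts at 1 in
 * scrub_setup_ctx() and scrub_put_ctx() frees the context via
 * scrub_free_ctx() once it drops to zero.]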
+ */ + refcount_t refs; +}; + +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + const char *errstr; + u64 physical; + u64 logical; + struct btrfs_device *dev; +}; + +static void release_scrub_stripe(struct scrub_stripe *stripe) +{ + if (!stripe) + return; + + for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { + if (stripe->pages[i]) + __free_page(stripe->pages[i]); + stripe->pages[i] = NULL; + } + kfree(stripe->sectors); + kfree(stripe->csums); + stripe->sectors = NULL; + stripe->csums = NULL; + stripe->sctx = NULL; + stripe->state = 0; +} + +static int init_scrub_stripe(struct btrfs_fs_info *fs_info, + struct scrub_stripe *stripe) +{ + int ret; + + memset(stripe, 0, sizeof(*stripe)); + + stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + stripe->state = 0; + + init_waitqueue_head(&stripe->io_wait); + init_waitqueue_head(&stripe->repair_wait); + atomic_set(&stripe->pending_io, 0); + spin_lock_init(&stripe->write_error_lock); + + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + if (ret < 0) + goto error; + + stripe->sectors = kcalloc(stripe->nr_sectors, + sizeof(struct scrub_sector_verification), + GFP_KERNEL); + if (!stripe->sectors) + goto error; + + stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, + fs_info->csum_size, GFP_KERNEL); + if (!stripe->csums) + goto error; + return 0; +error: + release_scrub_stripe(stripe); + return -ENOMEM; +} + +static void wait_scrub_stripe_io(struct scrub_stripe *stripe) +{ + wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); +} + +static void scrub_put_ctx(struct scrub_ctx *sctx); + +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_pause_on(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); +} + +static void scrub_pause_off(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + scrub_pause_on(fs_info); + scrub_pause_off(fs_info); +} + +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) +{ + int i; + + if (!sctx) + return; + + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) + release_scrub_stripe(&sctx->stripes[i]); + + kvfree(sctx); +} + +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (refcount_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + +static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + struct btrfs_fs_info *fs_info, int is_dev_replace) +{ + struct scrub_ctx *sctx; + int i; + + /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use + * kvzalloc(). 
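 * [Editor's note, not part of the upstream file: 128 is SCRUB_GROUPS_PER_SCTX
 * (16) * SCRUB_STRIPES_PER_GROUP (8); kvzalloc() falls back to vmalloc() when
 * a physically contiguous allocation of that size is unavailable.]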
+ */ + sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL); + if (!sctx) + goto nomem; + refcount_set(&sctx->refs, 1); + sctx->is_dev_replace = is_dev_replace; + sctx->fs_info = fs_info; + sctx->extent_path.search_commit_root = 1; + sctx->extent_path.skip_locking = 1; + sctx->csum_path.search_commit_root = 1; + sctx->csum_path.skip_locking = 1; + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { + int ret; + + ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); + if (ret < 0) + goto nomem; + sctx->stripes[i].sctx = sctx; + } + sctx->first_free = 0; + atomic_set(&sctx->cancel_req, 0); + + spin_lock_init(&sctx->stat_lock); + sctx->throttle_deadline = 0; + + mutex_init(&sctx->wr_lock); + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; + } + + return sctx; + +nomem: + scrub_free_ctx(sctx); + return ERR_PTR(-ENOMEM); +} + +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) +{ + u32 nlink; + int ret; + int i; + unsigned nofs_flag; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = warn_ctx; + struct btrfs_fs_info *fs_info = swarn->dev->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key key; + + local_root = btrfs_get_fs_root(fs_info, root, true); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + /* + * this makes the path point to (inum INODE_ITEM ioff) + */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); + if (ret) { + btrfs_put_root(local_root); + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + /* + * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub + * uses GFP_NOFS in this context, so we keep it consistent but it does + * not seem to be strictly necessary. 
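 *
 * [Editor's note, not part of the upstream file: memalloc_nofs_save() marks
 * the task so that allocations within the scope implicitly behave as
 * GFP_NOFS, which is how ipath's internal allocations are covered without
 * passing the flag explicitly.]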
+ */ + nofs_flag = memalloc_nofs_save(); + ipath = init_ipath(4096, local_root, swarn->path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(ipath)) { + btrfs_put_root(local_root); + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + btrfs_warn_in_rcu(fs_info, +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", + swarn->errstr, swarn->logical, + btrfs_dev_name(swarn->dev), + swarn->physical, + root, inum, offset, + fs_info->sectorsize, nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + + btrfs_put_root(local_root); + free_ipath(ipath); + return 0; + +err: + btrfs_warn_in_rcu(fs_info, + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + swarn->errstr, swarn->logical, + btrfs_dev_name(swarn->dev), + swarn->physical, + root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, + bool is_super, u64 logical, u64 physical) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u64 flags = 0; + u32 item_size; + int ret; + + /* Super block error, no need to search extent tree. */ + if (is_super) { + btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + errstr, btrfs_dev_name(dev), physical); + return; + } + path = btrfs_alloc_path(); + if (!path) + return; + + swarn.physical = physical; + swarn.logical = logical; + swarn.errstr = errstr; + swarn.dev = NULL; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, + &flags); + if (ret < 0) + goto out; + + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size(eb, path->slots[0]); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + unsigned long ptr = 0; + u8 ref_level; + u64 ref_root; + + while (true) { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + if (ret < 0) { + btrfs_warn(fs_info, + "failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); + break; + } + if (ret > 0) + break; + btrfs_warn_in_rcu(fs_info, +"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", + errstr, swarn.logical, btrfs_dev_name(dev), + swarn.physical, (ref_level ? 
"node" : "leaf"), + ref_level, ref_root); + } + btrfs_release_path(path); + } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + btrfs_release_path(path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = swarn.logical - found_key.objectid; + ctx.fs_info = fs_info; + + swarn.path = path; + swarn.dev = dev; + + iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); +} + +static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +{ + int ret = 0; + u64 length; + + if (!btrfs_is_zoned(sctx->fs_info)) + return 0; + + if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + + if (sctx->write_pointer < physical) { + length = physical - sctx->write_pointer; + + ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, + sctx->write_pointer, length); + if (!ret) + sctx->write_pointer = physical; + } + return ret; +} + +static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; + + return stripe->pages[page_index]; +} + +static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, + int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + + return offset_in_page(sector_nr << fs_info->sectorsize_bits); +} + +static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); + const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); + const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + struct btrfs_header *header; + + /* + * Here we don't have a good way to attach the pages (and subpages) + * to a dummy extent buffer, thus we have to directly grab the members + * from pages. + */ + header = (struct btrfs_header *)(page_address(first_page) + first_off); + memcpy(on_disk_csum, header->csum, fs_info->csum_size); + + if (logical != btrfs_stack_header_bytenr(header)) { + bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + logical, stripe->mirror_num, + btrfs_stack_header_bytenr(header), logical); + return; + } + if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + logical, stripe->mirror_num, + header->fsid, fs_info->fs_devices->fsid); + return; + } + if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + logical, stripe->mirror_num, + header->chunk_tree_uuid, fs_info->chunk_tree_uuid); + return; + } + + /* Now check tree block csum. 
*/ + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + crypto_shash_update(shash, page_address(first_page) + first_off + + BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + + for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); + + crypto_shash_update(shash, page_address(page) + page_off, + fs_info->sectorsize); + } + + crypto_shash_final(shash, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + logical, stripe->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + return; + } + if (stripe->sectors[sector_nr].generation != + btrfs_stack_header_generation(header)) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + logical, stripe->mirror_num, + btrfs_stack_header_generation(header), + stripe->sectors[sector_nr].generation); + return; + } + bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); +} + +static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + struct page *page = scrub_stripe_get_page(stripe, sector_nr); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + u8 csum_buf[BTRFS_CSUM_SIZE]; + int ret; + + ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); + + /* Sector not utilized, skip it. */ + if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + return; + + /* IO error, no need to check. */ + if (test_bit(sector_nr, &stripe->io_error_bitmap)) + return; + + /* Metadata, verify the full tree block. */ + if (sector->is_metadata) { + /* + * Check if the tree block crosses the stripe boudary. If + * crossed the boundary, we cannot verify it but only give a + * warning. + * + * This can only happen on a very old filesystem where chunks + * are not ensured to be stripe aligned. + */ + if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { + btrfs_warn_rl(fs_info, + "tree block at %llu crosses stripe boundary %llu", + stripe->logical + + (sector_nr << fs_info->sectorsize_bits), + stripe->logical); + return; + } + scrub_verify_one_metadata(stripe, sector_nr); + return; + } + + /* + * Data is easier, we just verify the data csum (if we have it). For + * cases without csum, we have no other choice but to trust it. 
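 *
 * [Editor's note, not part of the upstream file: a NULL sector->csum
 * typically means the extent was written with NODATASUM (e.g. nodatacow),
 * so the only action taken below is clearing any previously recorded error
 * bit.]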
+ */ + if (!sector->csum) { + clear_bit(sector_nr, &stripe->error_bitmap); + return; + } + + ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + if (ret < 0) { + set_bit(sector_nr, &stripe->csum_error_bitmap); + set_bit(sector_nr, &stripe->error_bitmap); + } else { + clear_bit(sector_nr, &stripe->csum_error_bitmap); + clear_bit(sector_nr, &stripe->error_bitmap); + } +} + +/* Verify specified sectors of a stripe. */ +static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + int sector_nr; + + for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { + scrub_verify_one_sector(stripe, sector_nr); + if (stripe->sectors[sector_nr].is_metadata) + sector_nr += sectors_per_tree - 1; + } +} + +static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) +{ + int i; + + for (i = 0; i < stripe->nr_sectors; i++) { + if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && + scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + break; + } + ASSERT(i < stripe->nr_sectors); + return i; +} + +/* + * Repair read is different to the regular read: + * + * - Only reads the failed sectors + * - May have extra blocksize limits + */ +static void scrub_repair_read_endio(struct btrfs_bio *bbio) +{ + struct scrub_stripe *stripe = bbio->private; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + + if (bbio->bio.bi_status) { + bitmap_set(&stripe->io_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + bitmap_set(&stripe->error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + } else { + bitmap_clear(&stripe->io_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + } + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) + wake_up(&stripe->io_wait); +} + +static int calc_next_mirror(int mirror, int num_copies) +{ + ASSERT(mirror <= num_copies); + return (mirror + 1 > num_copies) ? 1 : mirror + 1; +} + +static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, + int mirror, int blocksize, bool wait) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + const unsigned long old_error_bitmap = stripe->error_bitmap; + int i; + + ASSERT(stripe->mirror_num >= 1); + ASSERT(atomic_read(&stripe->pending_io) == 0); + + for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { + struct page *page; + int pgoff; + int ret; + + page = scrub_stripe_get_page(stripe, i); + pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* The current sector cannot be merged, submit the bio. 
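 *
 * [Editor's note, not part of the upstream file: a failed sector is merged
 * into the pending bio only while the previous sector also failed and the
 * bio stays below @blocksize; the sector-by-sector fallback pass uses
 * blocksize == sectorsize, so there every failed sector gets its own bio.]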
*/ + if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + bbio->bio.bi_iter.bi_size >= blocksize)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + if (wait) + wait_scrub_stripe_io(stripe); + bbio = NULL; + } + + if (!bbio) { + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_repair_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = (stripe->logical + + (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; + } + + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + ASSERT(ret == fs_info->sectorsize); + } + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + if (wait) + wait_scrub_stripe_io(stripe); + } +} + +static void scrub_stripe_report_errors(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_device *dev = NULL; + u64 physical = 0; + int nr_data_sectors = 0; + int nr_meta_sectors = 0; + int nr_nodatacsum_sectors = 0; + int nr_repaired_sectors = 0; + int sector_nr; + + if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) + return; + + /* + * Init needed infos for error reporting. + * + * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() + * thus no need for dev/physical, error reporting still needs dev and physical. + */ + if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + u64 mapped_len = fs_info->sectorsize; + struct btrfs_io_context *bioc = NULL; + int stripe_index = stripe->mirror_num - 1; + int ret; + + /* For scrub, our mirror_num should always start at 1. */ + ASSERT(stripe->mirror_num >= 1); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc, + NULL, NULL, 1); + /* + * If we failed, dev will be NULL, and later detailed reports + * will just be skipped. + */ + if (ret < 0) + goto skip; + physical = bioc->stripes[stripe_index].physical; + dev = bioc->stripes[stripe_index].dev; + btrfs_put_bioc(bioc); + } + +skip: + for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + bool repaired = false; + + if (stripe->sectors[sector_nr].is_metadata) { + nr_meta_sectors++; + } else { + nr_data_sectors++; + if (!stripe->sectors[sector_nr].csum) + nr_nodatacsum_sectors++; + } + + if (test_bit(sector_nr, &stripe->init_error_bitmap) && + !test_bit(sector_nr, &stripe->error_bitmap)) { + nr_repaired_sectors++; + repaired = true; + } + + /* Good sector from the beginning, nothing need to be done. */ + if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + continue; + + /* + * Report error for the corrupted sectors. If repaired, just + * output the message of repaired message. + */ + if (repaired) { + if (dev) { + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s physical %llu", + stripe->logical, btrfs_dev_name(dev), + physical); + } else { + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on mirror %u", + stripe->logical, stripe->mirror_num); + } + continue; + } + + /* The remaining are all for unrepaired. 
*/ + if (dev) { + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s physical %llu", + stripe->logical, btrfs_dev_name(dev), + physical); + } else { + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on mirror %u", + stripe->logical, stripe->mirror_num); + } + + if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("i/o error", dev, false, + stripe->logical, physical); + if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("checksum error", dev, false, + stripe->logical, physical); + if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("header error", dev, false, + stripe->logical, physical); + } + + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; + sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; + sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; + sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; + sctx->stat.no_csum += nr_nodatacsum_sectors; + sctx->stat.read_errors += stripe->init_nr_io_errors; + sctx->stat.csum_errors += stripe->init_nr_csum_errors; + sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.uncorrectable_errors += + bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + sctx->stat.corrected_errors += nr_repaired_sectors; + spin_unlock(&sctx->stat_lock); +} + +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace); + +/* + * The main entrance for all read related scrub work, including: + * + * - Wait for the initial read to finish + * - Verify and locate any bad sectors + * - Go through the remaining mirrors and try to read as large blocksize as + * possible + * - Go through all mirrors (including the failed mirror) sector-by-sector + * - Submit writeback for repaired sectors + * + * Writeback for dev-replace does not happen here, it needs extra + * synchronization for zoned devices. + */ +static void scrub_stripe_read_repair_worker(struct work_struct *work) +{ + struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); + struct scrub_ctx *sctx = stripe->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, + stripe->bg->length); + int mirror; + int i; + + ASSERT(stripe->mirror_num > 0); + + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + /* Save the initial failed bitmap for later repair and report usage. */ + stripe->init_error_bitmap = stripe->error_bitmap; + stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, + stripe->nr_sectors); + stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, + stripe->nr_sectors); + stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, + stripe->nr_sectors); + + if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + goto out; + + /* + * Try all remaining mirrors. + * + * Here we still try to read as large block as possible, as this is + * faster and we have extra safety nets to rely on. 
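 *
 * [Editor's note, not part of the upstream file: e.g. on a two-copy profile
 * (num_copies == 2) with the initial read done from mirror 1, this loop only
 * visits mirror 2; calc_next_mirror() wraps from the last copy back to 1.]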
+ */ + for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); + mirror != stripe->mirror_num; + mirror = calc_next_mirror(mirror, num_copies)) { + const unsigned long old_error_bitmap = stripe->error_bitmap; + + scrub_stripe_submit_repair_read(stripe, mirror, + BTRFS_STRIPE_LEN, false); + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, old_error_bitmap); + if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + goto out; + } + + /* + * Last safety net, try re-checking all mirrors, including the failed + * one, sector-by-sector. + * + * As if one sector failed the drive's internal csum, the whole read + * containing the offending sector would be marked as error. + * Thus here we do sector-by-sector read. + * + * This can be slow, thus we only try it as the last resort. + */ + + for (i = 0, mirror = stripe->mirror_num; + i < num_copies; + i++, mirror = calc_next_mirror(mirror, num_copies)) { + const unsigned long old_error_bitmap = stripe->error_bitmap; + + scrub_stripe_submit_repair_read(stripe, mirror, + fs_info->sectorsize, true); + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, old_error_bitmap); + if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + goto out; + } +out: + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. + */ + if (btrfs_is_zoned(fs_info)) { + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); + } else if (!sctx->readonly) { + unsigned long repaired; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + wait_scrub_stripe_io(stripe); + } + + scrub_stripe_report_errors(sctx, stripe); + set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); + wake_up(&stripe->repair_wait); +} + +static void scrub_read_endio(struct btrfs_bio *bbio) +{ + struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; + + if (bbio->bio.bi_status) { + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + } else { + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + } + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + } +} + +static void scrub_write_endio(struct btrfs_bio *bbio) +{ + struct scrub_stripe *stripe = bbio->private; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + u32 bio_size = 0; + int i; + + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + + if (bbio->bio.bi_status) { + unsigned long flags; + + spin_lock_irqsave(&stripe->write_error_lock, flags); + bitmap_set(&stripe->write_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&stripe->write_error_lock, flags); + } + bio_put(&bbio->bio); + + if (atomic_dec_and_test(&stripe->pending_io)) + 
wake_up(&stripe->io_wait); +} + +static void scrub_submit_write_bio(struct scrub_ctx *sctx, + struct scrub_stripe *stripe, + struct btrfs_bio *bbio, bool dev_replace) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + u32 bio_len = bbio->bio.bi_iter.bi_size; + u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) - + stripe->logical; + + fill_writer_pointer_gap(sctx, stripe->physical + bio_off); + atomic_inc(&stripe->pending_io); + btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); + if (!btrfs_is_zoned(fs_info)) + return; + /* + * For zoned writeback, queue depth must be 1, thus we must wait for + * the write to finish before the next write. + */ + wait_scrub_stripe_io(stripe); + + /* + * And also need to update the write pointer if write finished + * successfully. + */ + if (!test_bit(bio_off >> fs_info->sectorsize_bits, + &stripe->write_error_bitmap)) + sctx->write_pointer += bio_len; +} + +/* + * Submit the write bio(s) for the sectors specified by @write_bitmap. + * + * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: + * + * - Only needs logical bytenr and mirror_num + * Just like the scrub read path + * + * - Would only result in writes to the specified mirror + * Unlike the regular writeback path, which would write back to all stripes + * + * - Handle dev-replace and read-repair writeback differently + */ +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + int sector_nr; + + for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, sector_nr); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + int ret; + + /* We should only writeback sectors covered by an extent. */ + ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + + /* Cannot merge with previous sector, submit the current one. */ + if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { + scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); + bbio = NULL; + } + if (!bbio) { + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, + fs_info, scrub_write_endio, stripe); + bbio->bio.bi_iter.bi_sector = (stripe->logical + + (sector_nr << fs_info->sectorsize_bits)) >> + SECTOR_SHIFT; + } + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + ASSERT(ret == fs_info->sectorsize); + } + if (bbio) + scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); +} + +/* + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 + * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. + */ +static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, + unsigned int bio_size) +{ + const int time_slice = 1000; + s64 delta; + ktime_t now; + u32 div; + u64 bwlimit; + + bwlimit = READ_ONCE(device->scrub_speed_max); + if (bwlimit == 0) + return; + + /* + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. + */ + div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); + div = min_t(u32, 64, div); + + /* Start new epoch, set deadline */ + now = ktime_get(); + if (sctx->throttle_deadline == 0) { + sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); + sctx->throttle_sent = 0; + } + + /* Still in the time to send? 
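 *
 * [Editor's note, not part of the upstream file: as a worked example, with
 * scrub_speed_max = 64 MiB/s the code above picks div = 4, so each epoch is
 * 1000 / 4 = 250 ms long and allows div_u64(bwlimit, div) = 16 MiB of IO
 * before the task sleeps until the deadline.]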
*/ + if (ktime_before(now, sctx->throttle_deadline)) { + /* If current bio is within the limit, send it */ + sctx->throttle_sent += bio_size; + if (sctx->throttle_sent <= div_u64(bwlimit, div)) + return; + + /* We're over the limit, sleep until the rest of the slice */ + delta = ktime_ms_delta(sctx->throttle_deadline, now); + } else { + /* New request after deadline, start new epoch */ + delta = 0; + } + + if (delta) { + long timeout; + + timeout = div_u64(delta * HZ, 1000); + schedule_timeout_interruptible(timeout); + } + + /* Next call will start the deadline period */ + sctx->throttle_deadline = 0; +} + +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset, + u64 *stripe_start) +{ + int i; + int j = 0; + u64 last_offset; + const int data_stripes = nr_data_stripes(map); + + last_offset = (physical - map->stripes[num].physical) * data_stripes; + if (stripe_start) + *stripe_start = last_offset; + + *offset = last_offset; + for (i = 0; i < data_stripes; i++) { + u32 stripe_nr; + u32 stripe_index; + u32 rot; + + *offset = last_offset + btrfs_stripe_nr_to_offset(i); + + stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; + + /* Work out the disk rotation on this stripe-set */ + rot = stripe_nr % map->num_stripes; + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + btrfs_stripe_nr_to_offset(j); + return 1; +} + +/* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. + * Return >0 if the extent item is after @start_start + @search_len. + */ +static int compare_extent_item_range(struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; + u64 len; + struct btrfs_key key; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY); + if (key.type == BTRFS_METADATA_ITEM_KEY) + len = fs_info->nodesize; + else + len = key.offset; + + if (key.objectid + len <= search_start) + return -1; + if (key.objectid >= search_start + search_len) + return 1; + return 0; +} + +/* + * Locate one extent item which covers any byte in range + * [@search_start, @search_start + @search_length) + * + * If the path is not initialized, we will initialize the search by doing + * a btrfs_search_slot(). + * If the path is already initialized, we will use the path as the initial + * slot, to avoid duplicated btrfs_search_slot() calls. + * + * NOTE: If an extent item starts before @search_start, we will still + * return the extent item. This is for data extent crossing stripe boundary. + * + * Return 0 if we found such extent item, and @path will point to the extent item. + * Return >0 if no such extent item can be found, and @path will be released. + * Return <0 if hit fatal error, and @path will be released. 
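 *
 * [Editor's note, not part of the upstream file: the scrub code keeps the
 * same @path across consecutive calls for adjacent ranges (see the
 * extent_path member of scrub_ctx used by scrub_find_fill_first_stripe()),
 * turning the per-stripe lookups into a forward walk rather than a fresh
 * search each time.]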
+ */ +static int find_first_extent_item(struct btrfs_root *extent_root, + struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct btrfs_key key; + int ret; + + /* Continue using the existing path */ + if (path->nodes[0]) + goto search_forward; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = search_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ASSERT(ret > 0); + /* + * Here we intentionally pass 0 as @min_objectid, as there could be + * an extent item starting before @search_start. + */ + ret = btrfs_previous_extent_item(extent_root, path, 0); + if (ret < 0) + return ret; + /* + * No matter whether we have found an extent item, the next loop will + * properly do every check on the key. + */ +search_forward: + while (true) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid >= search_start + search_len) + break; + if (key.type != BTRFS_METADATA_ITEM_KEY && + key.type != BTRFS_EXTENT_ITEM_KEY) + goto next; + + ret = compare_extent_item_range(path, search_start, search_len); + if (ret == 0) + return ret; + if (ret > 0) + break; +next: + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(extent_root, path); + if (ret) { + /* Either no more item or fatal error */ + btrfs_release_path(path); + return ret; + } + } + } + btrfs_release_path(path); + return 1; +} + +static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + u64 *size_ret, u64 *flags_ret, u64 *generation_ret) +{ + struct btrfs_key key; + struct btrfs_extent_item *ei; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_EXTENT_ITEM_KEY); + *extent_start_ret = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + *size_ret = path->nodes[0]->fs_info->nodesize; + else + *size_ret = key.offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); + *flags_ret = btrfs_extent_flags(path->nodes[0], ei); + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); +} + +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; +} + +static void fill_one_extent_info(struct btrfs_fs_info *fs_info, + struct scrub_stripe *stripe, + u64 extent_start, u64 extent_len, + u64 extent_flags, u64 extent_gen) +{ + for (u64 cur_logical = max(stripe->logical, extent_start); + cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, + extent_start + extent_len); + cur_logical += fs_info->sectorsize) { + const int nr_sector = (cur_logical - stripe->logical) >> + fs_info->sectorsize_bits; + struct scrub_sector_verification *sector = + &stripe->sectors[nr_sector]; + + set_bit(nr_sector, &stripe->extent_sector_bitmap); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + 
sector->is_metadata = true; + sector->generation = extent_gen; + } + } +} + +static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) +{ + stripe->extent_sector_bitmap = 0; + stripe->init_error_bitmap = 0; + stripe->init_nr_io_errors = 0; + stripe->init_nr_csum_errors = 0; + stripe->init_nr_meta_errors = 0; + stripe->error_bitmap = 0; + stripe->io_error_bitmap = 0; + stripe->csum_error_bitmap = 0; + stripe->meta_error_bitmap = 0; +} + +/* + * Locate one stripe which has at least one extent in its range. + * + * Return 0 if found such stripe, and store its info into @stripe. + * Return >0 if there is no such stripe in the specified range. + * Return <0 for error. + */ +static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_path *extent_path, + struct btrfs_path *csum_path, + struct btrfs_device *dev, u64 physical, + int mirror_num, u64 logical_start, + u32 logical_len, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); + const u64 logical_end = logical_start + logical_len; + u64 cur_logical = logical_start; + u64 stripe_end; + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; + int ret; + + memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * + stripe->nr_sectors); + scrub_stripe_reset_bitmaps(stripe); + + /* The range must be inside the bg. */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + + ret = find_first_extent_item(extent_root, extent_path, logical_start, + logical_len); + /* Either error or not found. */ + if (ret) + goto out; + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, + &extent_gen); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + stripe->nr_meta_extents++; + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) + stripe->nr_data_extents++; + cur_logical = max(extent_start, cur_logical); + + /* + * Round down to stripe boundary. + * + * The extra calculation against bg->start is to handle block groups + * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. + */ + stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + + bg->start; + stripe->physical = physical + stripe->logical - logical_start; + stripe->dev = dev; + stripe->bg = bg; + stripe->mirror_num = mirror_num; + stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; + + /* Fill the first extent info into stripe->sectors[] array. */ + fill_one_extent_info(fs_info, stripe, extent_start, extent_len, + extent_flags, extent_gen); + cur_logical = extent_start + extent_len; + + /* Fill the extent info for the remaining sectors. */ + while (cur_logical <= stripe_end) { + ret = find_first_extent_item(extent_root, extent_path, cur_logical, + stripe_end - cur_logical + 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + get_extent_info(extent_path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + stripe->nr_meta_extents++; + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) + stripe->nr_data_extents++; + fill_one_extent_info(fs_info, stripe, extent_start, extent_len, + extent_flags, extent_gen); + cur_logical = extent_start + extent_len; + } + + /* Now fill the data csum. */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { + int sector_nr; + unsigned long csum_bitmap = 0; + + /* Csum space should have already been allocated. 
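
Note that the stripe start computed above is rounded down relative to bg->start rather than to absolute logical address zero, so a block group whose start is not BTRFS_STRIPE_LEN aligned still gets stripes that line up with the chunk layout. A small worked example of that arithmetic, assuming the 64KiB stripe length used here:

/* Illustration of the "round down relative to bg->start" computation above. */
#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN (64 * 1024)

static uint64_t stripe_logical_start(uint64_t bg_start, uint64_t cur_logical)
{
	/* Align to stripe boundaries measured from the chunk start, not from 0. */
	return ((cur_logical - bg_start) / STRIPE_LEN) * STRIPE_LEN + bg_start;
}

int main(void)
{
	/* A block group whose start is not 64KiB aligned. */
	uint64_t bg_start = 1024 * 1024 + 4096;		/* 1 MiB + 4 KiB */
	uint64_t cur = bg_start + 200 * 1024;		/* an extent inside it */
	uint64_t stripe = stripe_logical_start(bg_start, cur);

	printf("stripe starts at %llu, %llu bytes into the chunk\n",
	       (unsigned long long)stripe,
	       (unsigned long long)(stripe - bg_start));
	return 0;
}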
*/ + ASSERT(stripe->csums); + + /* + * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN + * should contain at most 16 sectors. + */ + ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + + ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, + stripe->logical, stripe_end, + stripe->csums, &csum_bitmap); + if (ret < 0) + goto out; + if (ret > 0) + ret = 0; + + for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { + stripe->sectors[sector_nr].csum = stripe->csums + + sector_nr * fs_info->csum_size; + } + } + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); +out: + return ret; +} + +static void scrub_reset_stripe(struct scrub_stripe *stripe) +{ + scrub_stripe_reset_bitmaps(stripe); + + stripe->nr_meta_extents = 0; + stripe->nr_data_extents = 0; + stripe->state = 0; + + for (int i = 0; i < stripe->nr_sectors; i++) { + stripe->sectors[i].is_metadata = false; + stripe->sectors[i].csum = NULL; + stripe->sectors[i].generation = 0; + } +} + +static void scrub_submit_initial_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_bio *bbio; + unsigned int nr_sectors = min_t(u64, BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; + int mirror = stripe->mirror_num; + + ASSERT(stripe->bg); + ASSERT(stripe->mirror_num > 0); + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + scrub_read_endio, stripe); + + bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); + int ret; + + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + /* We should have allocated enough bio vectors. */ + ASSERT(ret == fs_info->sectorsize); + } + atomic_inc(&stripe->pending_io); + + /* + * For dev-replace, either user asks to avoid the source dev, or + * the device is missing, we try the next mirror instead. + */ + if (sctx->is_dev_replace && + (fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || + !stripe->dev->bdev)) { + int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, + stripe->bg->length); + + mirror = calc_next_mirror(mirror, num_copies); + } + btrfs_submit_bio(bbio, mirror); +} + +static bool stripe_has_metadata_error(struct scrub_stripe *stripe) +{ + int i; + + for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { + if (stripe->sectors[i].is_metadata) { + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + + btrfs_err(fs_info, + "stripe %llu has unrepaired metadata sector at %llu", + stripe->logical, + stripe->logical + (i << fs_info->sectorsize_bits)); + return true; + } + } + return false; +} + +static void submit_initial_group_read(struct scrub_ctx *sctx, + unsigned int first_slot, + unsigned int nr_stripes) +{ + struct blk_plug plug; + + ASSERT(first_slot < SCRUB_TOTAL_STRIPES); + ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); + + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + btrfs_stripe_nr_to_offset(nr_stripes)); + blk_start_plug(&plug); + for (int i = 0; i < nr_stripes; i++) { + struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; + + /* Those stripes should be initialized. 
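
The csum lookup above returns one packed checksum buffer plus a bitmap saying which sectors actually have a checksum; per-sector pointers are then derived by indexing csum_size into that buffer. A toy version of that wiring follows, with illustrative names and the 4-byte crc32c checksum size (other checksum algorithms use larger sizes):

/* Wire a packed csum buffer to per-sector pointers via a bitmap. */
#include <stdint.h>
#include <stdio.h>

#define NR_SECTORS 16
#define CSUM_SIZE  4			/* crc32c */

struct sector {
	const uint8_t *csum;		/* NULL: no data csum for this sector */
};

static void map_csums(struct sector *sectors, const uint8_t *packed,
		      unsigned long csum_bitmap)
{
	for (int i = 0; i < NR_SECTORS; i++) {
		if (csum_bitmap & (1UL << i))
			sectors[i].csum = packed + i * CSUM_SIZE;
		else
			sectors[i].csum = NULL;
	}
}

int main(void)
{
	uint8_t packed[NR_SECTORS * CSUM_SIZE] = { 0 };
	struct sector sectors[NR_SECTORS];

	/* Sectors 0, 1 and 5 carry data csums. */
	map_csums(sectors, packed, 0x23UL);
	printf("sector 5 csum sits at offset %td in the packed buffer\n",
	       sectors[5].csum - packed);
	return 0;
}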
*/ + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + scrub_submit_initial_read(sctx, stripe); + } + blk_finish_plug(&plug); +} + +static int flush_scrub_stripes(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_stripe *stripe; + const int nr_stripes = sctx->cur_stripe; + int ret = 0; + + if (!nr_stripes) + return 0; + + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); + + /* Submit the stripes which are populated but not submitted. */ + if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { + const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); + + submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); + } + + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_event(stripe->repair_wait, + test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); + } + + /* Submit for dev-replace. */ + if (sctx->is_dev_replace) { + /* + * For dev-replace, if we know there is something wrong with + * metadata, we should immedately abort. + */ + for (int i = 0; i < nr_stripes; i++) { + if (stripe_has_metadata_error(&sctx->stripes[i])) { + ret = -EIO; + goto out; + } + } + for (int i = 0; i < nr_stripes; i++) { + unsigned long good; + + stripe = &sctx->stripes[i]; + + ASSERT(stripe->dev == fs_info->dev_replace.srcdev); + + bitmap_andnot(&good, &stripe->extent_sector_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, good, true); + } + } + + /* Wait for the above writebacks to finish. */ + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_scrub_stripe_io(stripe); + scrub_reset_stripe(stripe); + } +out: + sctx->cur_stripe = 0; + return ret; +} + +static void raid56_scrub_wait_endio(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, + struct btrfs_device *dev, int mirror_num, + u64 logical, u32 length, u64 physical, + u64 *found_logical_ret) +{ + struct scrub_stripe *stripe; + int ret; + + /* + * There should always be one slot left, as caller filling the last + * slot should flush them all. + */ + ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); + + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); + + stripe = &sctx->stripes[sctx->cur_stripe]; + scrub_reset_stripe(stripe); + ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, + &sctx->csum_path, dev, physical, + mirror_num, logical, length, stripe); + /* Either >0 as no more extents or <0 for error. */ + if (ret) + return ret; + *found_logical_ret = stripe->logical; + sctx->cur_stripe++; + + /* We filled one group, submit it. */ + if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { + const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; + + submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); + } + + /* Last slot used, flush them all. 
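
queue_scrub_stripe() and flush_scrub_stripes() above implement a simple submission cadence: stripes are filled into fixed slots, every group of slots is submitted for reading as soon as it becomes full, and the flush submits whatever partial group is left at the end. The sketch below replays that cadence with made-up sizes; the real SCRUB_STRIPES_PER_GROUP and SCRUB_TOTAL_STRIPES values are defined elsewhere in scrub.c.

/* Replay of the grouped submission cadence with illustrative sizes. */
#include <stdio.h>

#define STRIPES_PER_GROUP 4

static void submit_group(int first_slot, int nr)
{
	printf("submit slots [%d, %d)\n", first_slot, first_slot + nr);
}

int main(void)
{
	int cur = 0;

	for (int queued = 0; queued < 10; queued++) {	/* queue 10 stripes */
		cur++;
		if (cur % STRIPES_PER_GROUP == 0)
			submit_group(cur - STRIPES_PER_GROUP, STRIPES_PER_GROUP);
	}
	/* Flush: submit whatever is populated but not yet submitted. */
	if (cur % STRIPES_PER_GROUP)
		submit_group(cur - cur % STRIPES_PER_GROUP,
			     cur % STRIPES_PER_GROUP);
	return 0;
}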
*/ + if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) + return flush_scrub_stripes(sctx); + return 0; +} + +static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 full_stripe_start) +{ + DECLARE_COMPLETION_ONSTACK(io_done); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_raid_bio *rbio; + struct btrfs_io_context *bioc = NULL; + struct btrfs_path extent_path = { 0 }; + struct btrfs_path csum_path = { 0 }; + struct bio *bio; + struct scrub_stripe *stripe; + bool all_empty = true; + const int data_stripes = nr_data_stripes(map); + unsigned long extent_bitmap = 0; + u64 length = btrfs_stripe_nr_to_offset(data_stripes); + int ret; + + ASSERT(sctx->raid56_data_stripes); + + /* + * For data stripe search, we cannot re-use the same extent/csum paths, + * as the data stripe bytenr may be smaller than previous extent. Thus + * we have to use our own extent/csum paths. + */ + extent_path.search_commit_root = 1; + extent_path.skip_locking = 1; + csum_path.search_commit_root = 1; + csum_path.skip_locking = 1; + + for (int i = 0; i < data_stripes; i++) { + int stripe_index; + int rot; + u64 physical; + + stripe = &sctx->raid56_data_stripes[i]; + rot = div_u64(full_stripe_start - bg->start, + data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; + stripe_index = (i + rot) % map->num_stripes; + physical = map->stripes[stripe_index].physical + + btrfs_stripe_nr_to_offset(rot); + + scrub_reset_stripe(stripe); + set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); + ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, + map->stripes[stripe_index].dev, physical, 1, + full_stripe_start + btrfs_stripe_nr_to_offset(i), + BTRFS_STRIPE_LEN, stripe); + if (ret < 0) + goto out; + /* + * No extent in this data stripe, need to manually mark them + * initialized to make later read submission happy. + */ + if (ret > 0) { + stripe->logical = full_stripe_start + + btrfs_stripe_nr_to_offset(i); + stripe->dev = map->stripes[stripe_index].dev; + stripe->mirror_num = 1; + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); + } + } + + /* Check if all data stripes are empty. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + all_empty = false; + break; + } + } + if (all_empty) { + ret = 0; + goto out; + } + + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + scrub_submit_initial_read(sctx, stripe); + } + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + + wait_event(stripe->repair_wait, + test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); + } + /* For now, no zoned support for RAID56. */ + ASSERT(!btrfs_is_zoned(sctx->fs_info)); + + /* + * Now all data stripes are properly verified. Check if we have any + * unrepaired, if so abort immediately or we could further corrupt the + * P/Q stripes. + * + * During the loop, also populate extent_bitmap. + */ + for (int i = 0; i < data_stripes; i++) { + unsigned long error; + + stripe = &sctx->raid56_data_stripes[i]; + + /* + * We should only check the errors where there is an extent. + * As we may hit an empty data stripe while it's missing. 
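
The per-stripe setup above applies the RAID56 rotation: the full stripe number (the chunk-relative offset divided by the number of data stripes and by BTRFS_STRIPE_LEN) determines which device slot holds data stripe i, namely (i + rot) % num_stripes, at physical offset rot * BTRFS_STRIPE_LEN into that device's extent. A worked example for a three-device RAID5 layout (illustrative numbers only):

/* Forward mapping of data stripes to device slots under RAID56 rotation. */
#include <stdio.h>

#define STRIPE_LEN (64 * 1024)

int main(void)
{
	const int num_stripes = 3;	/* RAID5: 2 data + 1 parity */
	const int data_stripes = 2;
	/* The 5th full stripe of the chunk (0-based full stripe number 4). */
	const unsigned long long full_stripe_offset =
		4ULL * data_stripes * STRIPE_LEN;
	const unsigned long long rot =
		full_stripe_offset / data_stripes / STRIPE_LEN;

	for (int i = 0; i < data_stripes; i++) {
		int stripe_index = (int)((i + rot) % num_stripes);

		printf("data stripe %d -> device slot %d, physical offset %llu\n",
		       i, stripe_index, rot * STRIPE_LEN);
	}
	return 0;
}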
+ */ + bitmap_and(&error, &stripe->error_bitmap, + &stripe->extent_sector_bitmap, stripe->nr_sectors); + if (!bitmap_empty(&error, stripe->nr_sectors)) { + btrfs_err(fs_info, +"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", + full_stripe_start, i, stripe->nr_sectors, + &error); + ret = -EIO; + goto out; + } + bitmap_or(&extent_bitmap, &extent_bitmap, + &stripe->extent_sector_bitmap, stripe->nr_sectors); + } + + /* Now we can check and regenerate the P/Q stripe. */ + bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); + bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; + bio->bi_private = &io_done; + bio->bi_end_io = raid56_scrub_wait_endio; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL, 1); + if (ret < 0) { + btrfs_put_bioc(bioc); + btrfs_bio_counter_dec(fs_info); + goto out; + } + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + btrfs_put_bioc(bioc); + if (!rbio) { + ret = -ENOMEM; + btrfs_bio_counter_dec(fs_info); + goto out; + } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_pages(rbio, stripe->pages, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } + raid56_parity_submit_scrub_rbio(rbio); + wait_for_completion_io(&io_done); + ret = blk_status_to_errno(bio->bi_status); + bio_put(bio); + btrfs_bio_counter_dec(fs_info); + + btrfs_release_path(&extent_path); + btrfs_release_path(&csum_path); +out: + return ret; +} + +/* + * Scrub one range which can only has simple mirror based profile. + * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in + * RAID0/RAID10). + * + * Since we may need to handle a subset of block group, we need @logical_start + * and @logical_length parameter. + */ +static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, + struct btrfs_device *device, + u64 physical, int mirror_num) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + u64 cur_logical = logical_start; + int ret; + + /* The range must be inside the bg */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { + u64 found_logical = U64_MAX; + u64 cur_physical = physical + cur_logical - logical_start; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + break; + } + /* Paused? */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + ret = 0; + break; + } + spin_unlock(&bg->lock); + + ret = queue_scrub_stripe(sctx, bg, device, mirror_num, + cur_logical, logical_end - cur_logical, + cur_physical, &found_logical); + if (ret > 0) { + /* No more extent, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; + ret = 0; + break; + } + if (ret < 0) + break; + + /* queue_scrub_stripe() returned 0, @found_logical must be updated. 
*/ + ASSERT(found_logical != U64_MAX); + cur_logical = found_logical + BTRFS_STRIPE_LEN; + + /* Don't hold CPU for too long time */ + cond_resched(); + } + return ret; +} + +/* Calculate the full stripe length for simple stripe based profiles */ +static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + + return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); +} + +/* Get the logical bytenr for the stripe */ +static u64 simple_stripe_get_logical(struct map_lookup *map, + struct btrfs_block_group *bg, + int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ + return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + + bg->start; +} + +/* Get the mirror number for the stripe */ +static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */ + return stripe_index % map->sub_stripes + 1; +} + +static int scrub_simple_stripe(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, + int stripe_index) +{ + const u64 logical_increment = simple_stripe_full_stripe_len(map); + const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); + const u64 orig_physical = map->stripes[stripe_index].physical; + const int mirror_num = simple_stripe_mirror_num(map, stripe_index); + u64 cur_logical = orig_logical; + u64 cur_physical = orig_physical; + int ret = 0; + + while (cur_logical < bg->start + bg->length) { + /* + * Inside each stripe, RAID0 is just SINGLE, and RAID10 is + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. + */ + ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + BTRFS_STRIPE_LEN, device, cur_physical, + mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ + cur_physical += BTRFS_STRIPE_LEN; + } + return ret; +} + +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, + struct extent_map *em, + struct btrfs_device *scrub_dev, + int stripe_index) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct map_lookup *map = em->map_lookup; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + const u64 chunk_logical = bg->start; + int ret; + int ret2; + u64 physical = map->stripes[stripe_index].physical; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; + u64 logical; + u64 logic_end; + /* The logical increment after finishing one stripe */ + u64 increment; + /* Offset inside the chunk */ + u64 offset; + u64 stripe_logical; + int stop_loop = 0; + + /* Extent_path should be released by now. */ + ASSERT(sctx->extent_path.nodes[0] == NULL); + + scrub_blocked_if_needed(fs_info); + + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); + } + + /* Prepare the extra data stripes used by RAID56. 
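
For RAID0/RAID10 the helpers above reduce addressing to two small divisions: stripe_index / sub_stripes gives how many BTRFS_STRIPE_LEN rows to skip to reach the first logical address served by that device stripe, stripe_index % sub_stripes + 1 gives the mirror number, and each iteration of scrub_simple_stripe() then advances the logical address by one full row while the physical address advances by a single stripe. The numbers below walk a four-stripe RAID10 chunk through that math (values are illustrative):

/* Simple-stripe addressing for a RAID10 chunk with 4 stripes, sub_stripes 2. */
#include <stdio.h>

#define STRIPE_LEN (64 * 1024)

int main(void)
{
	const int num_stripes = 4, sub_stripes = 2;
	const unsigned long long bg_start = 1024ULL * 1024 * 1024;
	/* One "row" of the chunk covers this much of the logical space: */
	const unsigned long long row_len =
		(unsigned long long)(num_stripes / sub_stripes) * STRIPE_LEN;

	for (int stripe_index = 0; stripe_index < num_stripes; stripe_index++) {
		unsigned long long first_logical = bg_start +
			(unsigned long long)(stripe_index / sub_stripes) * STRIPE_LEN;
		int mirror_num = stripe_index % sub_stripes + 1;

		printf("stripe %d: first logical %llu, mirror %d, row stride %llu\n",
		       stripe_index, first_logical, mirror_num, row_len);
	}
	return 0;
}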
*/ + if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(sctx->raid56_data_stripes == NULL); + + sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), + sizeof(struct scrub_stripe), + GFP_KERNEL); + if (!sctx->raid56_data_stripes) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < nr_data_stripes(map); i++) { + ret = init_scrub_stripe(fs_info, + &sctx->raid56_data_stripes[i]); + if (ret < 0) + goto out; + sctx->raid56_data_stripes[i].bg = bg; + sctx->raid56_data_stripes[i].sctx = sctx; + } + } + /* + * There used to be a big double loop to handle all profiles using the + * same routine, which grows larger and more gross over time. + * + * So here we handle each profile differently, so simpler profiles + * have simpler scrubbing function. + */ + if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID56_MASK))) { + /* + * Above check rules out all complex profile, the remaining + * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple + * mirrored duplication without stripe. + * + * Only @physical and @mirror_num needs to calculated using + * @stripe_index. + */ + ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + scrub_dev, map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); + offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); + goto out; + } + + /* Only RAID56 goes through the old code */ + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + ret = 0; + + /* Calculate the logical end of the stripe */ + get_raid56_logic_offset(physical_end, stripe_index, + map, &logic_end, NULL); + logic_end += chunk_logical; + + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); + increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); + + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe + * using their physical offset. + */ + while (physical < physical_end) { + ret = get_raid56_logic_offset(physical, stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; + ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, + map, stripe_logical); + if (ret) + goto out; + goto next; + } + + /* + * Now we're at a data stripe, scrub each extents in the range. + * + * At this stage, if we ignore the repair part, inside each data + * stripe it is no different than SINGLE profile. + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. 
+ */ + ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + scrub_dev, physical, 1); + if (ret < 0) + goto out; +next: + logical += increment; + physical += BTRFS_STRIPE_LEN; + spin_lock(&sctx->stat_lock); + if (stop_loop) + sctx->stat.last_physical = + map->stripes[stripe_index].physical + dev_stripe_len; + else + sctx->stat.last_physical = physical; + spin_unlock(&sctx->stat_lock); + if (stop_loop) + break; + } +out: + ret2 = flush_scrub_stripes(sctx); + if (!ret) + ret = ret2; + btrfs_release_path(&sctx->extent_path); + btrfs_release_path(&sctx->csum_path); + + if (sctx->raid56_data_stripes) { + for (int i = 0; i < nr_data_stripes(map); i++) + release_scrub_stripe(&sctx->raid56_data_stripes[i]); + kfree(sctx->raid56_data_stripes); + sctx->raid56_data_stripes = NULL; + } + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; + + ret2 = sync_write_pointer_for_zoned(sctx, + chunk_logical + offset, + map->stripes[stripe_index].physical, + physical_end); + if (ret2) + ret = ret2; + } + + return ret < 0 ? ret : 0; +} + +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, + struct btrfs_device *scrub_dev, + u64 dev_offset, + u64 dev_extent_len) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + int i; + int ret = 0; + + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, bg->start, bg->length); + read_unlock(&map_tree->lock); + + if (!em) { + /* + * Might have been an unused block group deleted by the cleaner + * kthread or relocation. + */ + spin_lock(&bg->lock); + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) + ret = -EINVAL; + spin_unlock(&bg->lock); + + return ret; + } + if (em->start != bg->start) + goto out; + if (em->len < dev_extent_len) + goto out; + + map = em->map_lookup; + for (i = 0; i < map->num_stripes; ++i) { + if (map->stripes[i].dev->bdev == scrub_dev->bdev && + map->stripes[i].physical == dev_offset) { + ret = scrub_stripe(sctx, bg, em, scrub_dev, i); + if (ret) + goto out; + } + } +out: + free_extent_map(em); + + return ret; +} + +static int finish_extent_writes_for_zoned(struct btrfs_root *root, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans); +} + +static noinline_for_stack +int scrub_enumerate_chunks(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, u64 start, u64 end) +{ + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *root = fs_info->dev_root; + u64 chunk_offset; + int ret = 0; + int ro_set; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_block_group *cache; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = scrub_dev->devid; + key.offset = 0ull; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + u64 dev_extent_len; + + 
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } else { + ret = 0; + } + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.objectid != scrub_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset >= end) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + dev_extent_len = btrfs_dev_extent_length(l, dev_extent); + + if (found_key.offset + dev_extent_len <= start) + goto skip; + + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + + /* + * get a reference on the corresponding block group to prevent + * the chunk from going away while we scrub it + */ + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + + ASSERT(cache->start <= chunk_offset); + /* + * We are using the commit root to search for device extents, so + * that means we could have found a device extent item from a + * block group that was deleted in the current transaction. The + * logical start offset of the deleted block group, stored at + * @chunk_offset, might be part of the logical address range of + * a new block group (which uses different physical extents). + * In this case btrfs_lookup_block_group() has returned the new + * block group, and its start address is less than @chunk_offset. + * + * We skip such new block groups, because it's pointless to + * process them, as we won't find their extents because we search + * for them using the commit root of the extent tree. For a device + * replace it's also fine to skip it, we won't miss copying them + * to the target device because we have the write duplication + * setup through the regular write path (by btrfs_map_block()), + * and we have committed a transaction when we started the device + * replace, right after setting up the device replace state. + */ + if (cache->start < chunk_offset) { + btrfs_put_block_group(cache); + goto skip; + } + + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { + if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { + btrfs_put_block_group(cache); + goto skip; + } + } + + /* + * Make sure that while we are scrubbing the corresponding block + * group doesn't get its logical address and its device extents + * reused for another block group, which can possibly be of a + * different type and different profile. We do this to prevent + * false error detections and crashes due to bogus attempts to + * repair extents. + */ + spin_lock(&cache->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + goto skip; + } + btrfs_freeze_block_group(cache); + spin_unlock(&cache->lock); + + /* + * we need call btrfs_inc_block_group_ro() with scrubs_paused, + * to avoid deadlock caused by: + * btrfs_inc_block_group_ro() + * -> btrfs_wait_for_commit() + * -> btrfs_commit_transaction() + * -> btrfs_scrub_pause() + */ + scrub_pause_on(fs_info); + + /* + * Don't do chunk preallocation for scrub. + * + * This is especially important for SYSTEM bgs, or we can hit + * -EFBIG from btrfs_finish_chunk_alloc() like: + * 1. 
The only SYSTEM bg is marked RO. + * Since SYSTEM bg is small, that's pretty common. + * 2. New SYSTEM bg will be allocated + * Due to regular version will allocate new chunk. + * 3. New SYSTEM bg is empty and will get cleaned up + * Before cleanup really happens, it's marked RO again. + * 4. Empty SYSTEM bg get scrubbed + * We go back to 2. + * + * This can easily boost the amount of SYSTEM chunks if cleaner + * thread can't be triggered fast enough, and use up all space + * of btrfs_super_block::sys_chunk_array + * + * While for dev replace, we need to try our best to mark block + * group RO, to prevent race between: + * - Write duplication + * Contains latest data + * - Scrub copy + * Contains data from commit tree + * + * If target block group is not marked RO, nocow writes can + * be overwritten by scrub copy, causing data corruption. + * So for dev-replace, it's not allowed to continue if a block + * group is not RO. + */ + ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); + if (!ret && sctx->is_dev_replace) { + ret = finish_extent_writes_for_zoned(root, cache); + if (ret) { + btrfs_dec_block_group_ro(cache); + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + + if (ret == 0) { + ro_set = 1; + } else if (ret == -ENOSPC && !sctx->is_dev_replace && + !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { + /* + * btrfs_inc_block_group_ro return -ENOSPC when it + * failed in creating new chunk for metadata. + * It is not a problem for scrub, because + * metadata are always cowed, and our scrub paused + * commit_transactions. + * + * For RAID56 chunks, we have to mark them read-only + * for scrub, as later we would use our own cache + * out of RAID56 realm. + * Thus we want the RAID56 bg to be marked RO to + * prevent RMW from screwing up out cache. + */ + ro_set = 0; + } else if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "skipping scrub of block group %llu due to active swapfile", + cache->start); + scrub_pause_off(fs_info); + ret = 0; + goto skip_unfreeze; + } else { + btrfs_warn(fs_info, + "failed setting block group ro: %d", ret); + btrfs_unfreeze_block_group(cache); + btrfs_put_block_group(cache); + scrub_pause_off(fs_info); + break; + } + + /* + * Now the target block is marked RO, wait for nocow writes to + * finish before dev-replace. + * COW is fine, as COW never overwrites extents in commit tree. + */ + if (sctx->is_dev_replace) { + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, + cache->length); + } + + scrub_pause_off(fs_info); + down_write(&dev_replace->rwsem); + dev_replace->cursor_right = found_key.offset + dev_extent_len; + dev_replace->cursor_left = found_key.offset; + dev_replace->item_needs_writeback = 1; + up_write(&dev_replace->rwsem); + + ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, + dev_extent_len); + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) + ro_set = 0; + + down_write(&dev_replace->rwsem); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + up_write(&dev_replace->rwsem); + + if (ro_set) + btrfs_dec_block_group_ro(cache); + + /* + * We might have prevented the cleaner kthread from deleting + * this block group if it was already unused because we raced + * and set it to RO mode first. So add it back to the unused + * list, otherwise it might not ever be deleted unless a manual + * balance is triggered or it becomes used and unused again. 
+ */ + spin_lock(&cache->lock); + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && + !cache->ro && cache->reserved == 0 && cache->used == 0) { + spin_unlock(&cache->lock); + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&fs_info->discard_ctl, + cache); + else + btrfs_mark_bg_unused(cache); + } else { + spin_unlock(&cache->lock); + } +skip_unfreeze: + btrfs_unfreeze_block_group(cache); + btrfs_put_block_group(cache); + if (ret) + break; + if (sctx->is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0) { + ret = -EIO; + break; + } + if (sctx->stat.malloc_errors > 0) { + ret = -ENOMEM; + break; + } +skip: + key.offset = found_key.offset + dev_extent_len; + btrfs_release_path(path); + } + + btrfs_free_path(path); + + return ret; +} + +static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, + struct page *page, u64 physical, u64 generation) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct bio_vec bvec; + struct bio bio; + struct btrfs_super_block *sb = page_address(page); + int ret; + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; + __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); + + if (ret < 0) + return ret; + ret = btrfs_check_super_csum(fs_info, sb); + if (ret != 0) { + btrfs_err_rl(fs_info, + "super block at physical %llu devid %llu has bad csum", + physical, dev->devid); + return -EIO; + } + if (btrfs_super_generation(sb) != generation) { + btrfs_err_rl(fs_info, +"super block at physical %llu devid %llu has bad generation %llu expect %llu", + physical, dev->devid, + btrfs_super_generation(sb), generation); + return -EUCLEAN; + } + + return btrfs_validate_super(fs_info, sb, -1); +} + +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) +{ + int i; + u64 bytenr; + u64 gen; + int ret = 0; + struct page *page; + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (BTRFS_FS_ERROR(fs_info)) + return -EROFS; + + page = alloc_page(GFP_KERNEL); + if (!page) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* Seed devices of a new filesystem has their own generation. */ + if (scrub_dev->fs_devices != fs_info->fs_devices) + gen = scrub_dev->generation; + else + gen = fs_info->last_trans_committed; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE > + scrub_dev->commit_total_bytes) + break; + if (!btrfs_check_super_location(scrub_dev, bytenr)) + continue; + + ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + } + } + __free_page(page); + return 0; +} + +static void scrub_workers_put(struct btrfs_fs_info *fs_info) +{ + if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, + &fs_info->scrub_lock)) { + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + + fs_info->scrub_workers = NULL; + mutex_unlock(&fs_info->scrub_lock); + + if (scrub_workers) + destroy_workqueue(scrub_workers); + } +} + +/* + * get a reference count on fs_info->scrub_workers. 
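
scrub_supers() above only reads the superblock copies that actually fit on the device: the copies sit at fixed offsets (64KiB, 64MiB and 256GiB), each is BTRFS_SUPER_INFO_SIZE (4KiB) long, and the loop stops at the first copy that would extend past commit_total_bytes. A standalone sketch of that bounds check against a plain device size:

/* Which super block copies would be scrubbed on a given device size? */
#include <stdio.h>

int main(void)
{
	const unsigned long long sb_offsets[] = {
		64ULL * 1024,			/* primary copy */
		64ULL * 1024 * 1024,		/* mirror 1 */
		256ULL * 1024 * 1024 * 1024,	/* mirror 2 */
	};
	const unsigned long long super_info_size = 4096;
	const unsigned long long dev_size = 10ULL * 1024 * 1024 * 1024; /* 10 GiB */

	for (int i = 0; i < 3; i++) {
		if (sb_offsets[i] + super_info_size > dev_size) {
			printf("copy %d at %llu is past the end, stop\n",
			       i, sb_offsets[i]);
			break;
		}
		printf("copy %d at %llu would be scrubbed\n", i, sb_offsets[i]);
	}
	return 0;
}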
start worker if necessary + */ +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) +{ + struct workqueue_struct *scrub_workers = NULL; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; + int ret = -ENOMEM; + + if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) + return 0; + + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); + if (!scrub_workers) + return -ENOMEM; + + mutex_lock(&fs_info->scrub_lock); + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL); + fs_info->scrub_workers = scrub_workers; + refcount_set(&fs_info->scrub_workers_refcnt, 1); + mutex_unlock(&fs_info->scrub_lock); + return 0; + } + /* Other thread raced in and created the workers for us */ + refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + + ret = 0; + + destroy_workqueue(scrub_workers); + return ret; +} + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace) +{ + struct btrfs_dev_lookup_args args = { .devid = devid }; + struct scrub_ctx *sctx; + int ret; + struct btrfs_device *dev; + unsigned int nofs_flag; + bool need_commit = false; + + if (btrfs_fs_closing(fs_info)) + return -EAGAIN; + + /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ + ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); + + /* + * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible + * value (max nodesize / min sectorsize), thus nodesize should always + * be fine. + */ + ASSERT(fs_info->nodesize <= + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); + + /* Allocate outside of device_list_mutex */ + sctx = scrub_setup_ctx(fs_info, is_dev_replace); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); + + ret = scrub_workers_get(fs_info); + if (ret) + goto out_free_ctx; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info->fs_devices, &args); + if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && + !is_dev_replace)) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + ret = -ENODEV; + goto out; + } + + if (!is_dev_replace && !readonly && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_err_in_rcu(fs_info, + "scrub on devid %llu: filesystem on %s is not writable", + devid, btrfs_dev_name(dev)); + ret = -EROFS; + goto out; + } + + mutex_lock(&fs_info->scrub_lock); + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + ret = -EIO; + goto out; + } + + down_read(&fs_info->dev_replace.rwsem); + if (dev->scrub_ctx || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + up_read(&fs_info->dev_replace.rwsem); + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + ret = -EINPROGRESS; + goto out; + } + up_read(&fs_info->dev_replace.rwsem); + + sctx->readonly = readonly; + dev->scrub_ctx = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. 
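
scrub_workers_get() above follows a common pattern for lazily created shared resources: try a lockless refcount_inc_not_zero() fast path, allocate the workqueue without holding scrub_lock, then take the lock and either install the allocation (if the refcount is still zero) or bump the refcount and destroy the now redundant workqueue. The same idea, stripped down to a generic resource protected by a plain mutex and counter (hypothetical names, userspace pthreads, no fast path):

/* Allocate outside the lock, install or discard under the lock. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *shared;		/* the lazily created shared resource */
static unsigned int refcnt;	/* protected by @lock */

static int resource_get(void)
{
	int *mine = malloc(sizeof(*mine));	/* expensive setup, no lock held */

	if (!mine)
		return -1;
	*mine = 42;

	pthread_mutex_lock(&lock);
	if (refcnt == 0) {
		shared = mine;			/* we won: install our allocation */
		refcnt = 1;
		pthread_mutex_unlock(&lock);
		return 0;
	}
	refcnt++;				/* someone else installed it first */
	pthread_mutex_unlock(&lock);
	free(mine);				/* drop the redundant allocation */
	return 0;
}

static void resource_put(void)
{
	pthread_mutex_lock(&lock);
	if (--refcnt == 0) {
		free(shared);
		shared = NULL;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	resource_get();
	resource_get();
	resource_put();
	resource_put();
	printf("shared resource is %s\n", shared ? "still set" : "freed");
	return 0;
}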
+ */ + __scrub_blocked_if_needed(fs_info); + atomic_inc(&fs_info->scrubs_running); + mutex_unlock(&fs_info->scrub_lock); + + /* + * In order to avoid deadlock with reclaim when there is a transaction + * trying to pause scrub, make sure we use GFP_NOFS for all the + * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() + * invoked by our callees. The pausing request is done when the + * transaction commit starts, and it blocks the transaction until scrub + * is paused (done at specific points at scrub_stripe() or right above + * before incrementing fs_info->scrubs_running). + */ + nofs_flag = memalloc_nofs_save(); + if (!is_dev_replace) { + u64 old_super_errors; + + spin_lock(&sctx->stat_lock); + old_super_errors = sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + + btrfs_info(fs_info, "scrub: started on devid %llu", devid); + /* + * by holding device list mutex, we can + * kick off writing super in log tree sync. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + ret = scrub_supers(sctx, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + spin_lock(&sctx->stat_lock); + /* + * Super block errors found, but we can not commit transaction + * at current context, since btrfs_commit_transaction() needs + * to pause the current running scrub (hold by ourselves). + */ + if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) + need_commit = true; + spin_unlock(&sctx->stat_lock); + } + + if (!ret) + ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); + + atomic_dec(&fs_info->scrubs_running); + wake_up(&fs_info->scrub_pause_wait); + + if (progress) + memcpy(progress, &sctx->stat, sizeof(*progress)); + + if (!is_dev_replace) + btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", + ret ? "not finished" : "finished", devid, ret); + + mutex_lock(&fs_info->scrub_lock); + dev->scrub_ctx = NULL; + mutex_unlock(&fs_info->scrub_lock); + + scrub_workers_put(fs_info); + scrub_put_ctx(sctx); + + /* + * We found some super block errors before, now try to force a + * transaction commit, as scrub has finished. 
+ */ + if (need_commit) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_err(fs_info, + "scrub: failed to start transaction to fix super block errors: %d", ret); + return ret; + } + ret = btrfs_commit_transaction(trans); + if (ret < 0) + btrfs_err(fs_info, + "scrub: failed to commit transaction to fix super block errors: %d", ret); + } + return ret; +out: + scrub_workers_put(fs_info); +out_free_ctx: + scrub_free_ctx(sctx); + + return ret; +} + +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrub_pause_req); + while (atomic_read(&fs_info->scrubs_paused) != + atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_paused) == + atomic_read(&fs_info->scrubs_running)); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); +} + +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) +{ + atomic_dec(&fs_info->scrub_pause_req); + wake_up(&fs_info->scrub_pause_wait); +} + +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->scrub_lock); + if (!atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + + atomic_inc(&fs_info->scrub_cancel_req); + while (atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_running) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrub_cancel_req); + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_cancel_dev(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct scrub_ctx *sctx; + + mutex_lock(&fs_info->scrub_lock); + sctx = dev->scrub_ctx; + if (!sctx) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + atomic_inc(&sctx->cancel_req); + while (dev->scrub_ctx) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + dev->scrub_ctx == NULL); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress) +{ + struct btrfs_dev_lookup_args args = { .devid = devid }; + struct btrfs_device *dev; + struct scrub_ctx *sctx = NULL; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info->fs_devices, &args); + if (dev) + sctx = dev->scrub_ctx; + if (sctx) + memcpy(progress, &sctx->stat, sizeof(*progress)); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; +} diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h new file mode 100644 index 0000000000..7639103ebf --- /dev/null +++ b/fs/btrfs/scrub.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SCRUB_H +#define BTRFS_SCRUB_H + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c new file mode 100644 index 0000000000..db94eefda2 --- /dev/null +++ b/fs/btrfs/send.c @@ -0,0 +1,8413 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "send.h" +#include "ctree.h" +#include "backref.h" +#include "locking.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "compression.h" +#include "xattr.h" +#include "print-tree.h" +#include "accessors.h" +#include "dir-item.h" +#include "file-item.h" +#include "ioctl.h" +#include "verity.h" +#include "lru_cache.h" + +/* + * Maximum number of references an extent can have in order for us to attempt to + * issue clone operations instead of write operations. This currently exists to + * avoid hitting limitations of the backreference walking code (taking a lot of + * time and using too much memory for extents with large number of references). + */ +#define SEND_MAX_EXTENT_REFS 1024 + +/* + * A fs_path is a helper to dynamically build path names with unknown size. + * It reallocates the internal buffer on demand. + * It allows fast adding of path elements on the right side (normal path) and + * fast adding to the left side (reversed path). A reversed path can also be + * unreversed if needed. + */ +struct fs_path { + union { + struct { + char *start; + char *end; + + char *buf; + unsigned short buf_len:15; + unsigned short reversed:1; + char inline_buf[]; + }; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; + }; +}; +#define FS_PATH_INLINE_SIZE \ + (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) + + +/* reused for each extent */ +struct clone_root { + struct btrfs_root *root; + u64 ino; + u64 offset; + u64 num_bytes; + bool found_ref; +}; + +#define SEND_MAX_NAME_CACHE_SIZE 256 + +/* + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. + * The most common case is to have a single root for cloning, which corresponds + * to the send root. Having the user specify more than 16 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of + * cloning roots that lead down to a leaf is more than 17. + */ +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 + +/* + * Max number of entries in the cache. 
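
The fs_path union above pads the whole structure to 256 bytes, so the inline buffer is simply whatever remains after the three pointers and the bit-fields, and FS_PATH_INLINE_SIZE falls out of sizeof() minus offsetof(). The sketch below reproduces that arithmetic with a look-alike structure; the sizes it prints are for a typical LP64 userspace build and are only meant to show how the layout works, not to state the exact kernel figure.

/* Reproduce the FS_PATH_INLINE_SIZE arithmetic with a look-alike struct. */
#include <stddef.h>
#include <stdio.h>

struct fs_path_like {
	union {
		struct {
			char *start;
			char *end;
			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		char pad[256];
	};
};

int main(void)
{
	printf("struct size %zu, inline buffer %zu bytes\n",
	       sizeof(struct fs_path_like),
	       sizeof(struct fs_path_like) -
			offsetof(struct fs_path_like, inline_buf));
	return 0;
}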
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. + */ +#define SEND_MAX_BACKREF_CACHE_SIZE 128 + +/* + * A backref cache entry maps a leaf to a list of IDs of roots from which the + * leaf is accessible and we can use for clone operations. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on + * x86_64). + */ +struct backref_cache_entry { + struct btrfs_lru_cache_entry entry; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; +}; + +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + +struct send_ctx { + struct file *send_filp; + loff_t send_off; + char *send_buf; + u32 send_size; + u32 send_max_size; + /* + * Whether BTRFS_SEND_A_DATA attribute was already added to current + * command (since protocol v2, data must be the last attribute). + */ + bool put_data; + struct page **send_buf_pages; + u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ + /* Protocol version compatibility requested */ + u32 proto; + + struct btrfs_root *send_root; + struct btrfs_root *parent_root; + struct clone_root *clone_roots; + int clone_roots_cnt; + + /* current state of the compare_tree call */ + struct btrfs_path *left_path; + struct btrfs_path *right_path; + struct btrfs_key *cmp_key; + + /* + * Keep track of the generation of the last transaction that was used + * for relocating a block group. This is periodically checked in order + * to detect if a relocation happened since the last check, so that we + * don't operate on stale extent buffers for nodes (level >= 1) or on + * stale disk_bytenr values of file extent items. + */ + u64 last_reloc_trans; + + /* + * infos of the currently processed inode. In case of deleted inodes, + * these are the values from the deleted inode. + */ + u64 cur_ino; + u64 cur_inode_gen; + u64 cur_inode_size; + u64 cur_inode_mode; + u64 cur_inode_rdev; + u64 cur_inode_last_extent; + u64 cur_inode_next_write_offset; + bool cur_inode_new; + bool cur_inode_new_gen; + bool cur_inode_deleted; + bool ignore_cur_inode; + bool cur_inode_needs_verity; + void *verity_descriptor; + + u64 send_progress; + + struct list_head new_refs; + struct list_head deleted_refs; + + struct btrfs_lru_cache name_cache; + + /* + * The inode we are currently processing. It's not NULL only when we + * need to issue write commands for data extents from this inode. 
+ */ + struct inode *cur_inode; + struct file_ra_state ra; + u64 page_cache_clear_start; + bool clean_page_cache; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; + + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; + + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. 
Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; + u64 rmdir_gen; + bool orphanized; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; + u64 last_dir_index_offset; + u64 dir_high_seq_ino; +}; + +struct name_cache_entry { + /* + * The key in the entry is an inode number, and the generation matches + * the inode's generation. + */ + struct btrfs_lru_cache_entry entry; + u64 parent_ino; + u64 parent_gen; + int ret; + int need_later_update; + int name_len; + char name[]; +}; + +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, +}; + +__cold +static void inconsistent_snapshot_error(struct send_ctx *sctx, + enum btrfs_compare_tree_result result, + const char *what) +{ + const char *result_string; + + switch (result) { + case BTRFS_COMPARE_TREE_NEW: + result_string = "new"; + break; + case BTRFS_COMPARE_TREE_DELETED: + result_string = "deleted"; + break; + case BTRFS_COMPARE_TREE_CHANGED: + result_string = "updated"; + break; + case BTRFS_COMPARE_TREE_SAME: + ASSERT(0); + result_string = "unchanged"; + break; + default: + ASSERT(0); + result_string = "unexpected"; + } + + btrfs_err(sctx->send_root->fs_info, + "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", + result_string, what, sctx->cmp_key->objectid, + sctx->send_root->root_key.objectid, + (sctx->parent_root ? + sctx->parent_root->root_key.objectid : 0)); +} + +__maybe_unused +static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) +{ + switch (sctx->proto) { + case 1: return cmd <= BTRFS_SEND_C_MAX_V1; + case 2: return cmd <= BTRFS_SEND_C_MAX_V2; + case 3: return cmd <= BTRFS_SEND_C_MAX_V3; + default: return false; + } +} + +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + +static void fs_path_reset(struct fs_path *p) +{ + if (p->reversed) { + p->start = p->buf + p->buf_len - 1; + p->end = p->start; + *p->start = 0; + } else { + p->start = p->buf; + p->end = p->start; + *p->start = 0; + } +} + +static struct fs_path *fs_path_alloc(void) +{ + struct fs_path *p; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return NULL; + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); + return p; +} + +static struct fs_path *fs_path_alloc_reversed(void) +{ + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return NULL; + p->reversed = 1; + fs_path_reset(p); + return p; +} + +static void fs_path_free(struct fs_path *p) +{ + if (!p) + return; + if (p->buf != p->inline_buf) + kfree(p->buf); + kfree(p); +} + +static int fs_path_len(struct fs_path *p) +{ + return p->end - p->start; +} + +static int fs_path_ensure_buf(struct fs_path *p, int len) +{ + char *tmp_buf; + int path_len; + int old_buf_len; + + len++; + + if (p->buf_len >= len) + return 0; + + if (len > PATH_MAX) 
{ + WARN_ON(1); + return -ENOMEM; + } + + path_len = p->end - p->start; + old_buf_len = p->buf_len; + + /* + * Allocate to the next largest kmalloc bucket size, to let + * the fast path happen most of the time. + */ + len = kmalloc_size_roundup(len); + /* + * First time the inline_buf does not suffice + */ + if (p->buf == p->inline_buf) { + tmp_buf = kmalloc(len, GFP_KERNEL); + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); + } else { + tmp_buf = krealloc(p->buf, len, GFP_KERNEL); + } + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + p->buf_len = len; + + if (p->reversed) { + tmp_buf = p->buf + old_buf_len - path_len - 1; + p->end = p->buf + p->buf_len - 1; + p->start = p->end - path_len; + memmove(p->start, tmp_buf, path_len + 1); + } else { + p->start = p->buf; + p->end = p->start + path_len; + } + return 0; +} + +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) +{ + int ret; + int new_len; + + new_len = p->end - p->start + name_len; + if (p->start != p->end) + new_len++; + ret = fs_path_ensure_buf(p, new_len); + if (ret < 0) + goto out; + + if (p->reversed) { + if (p->start != p->end) + *--p->start = '/'; + p->start -= name_len; + *prepared = p->start; + } else { + if (p->start != p->end) + *p->end++ = '/'; + *prepared = p->end; + p->end += name_len; + *p->end = 0; + } + +out: + return ret; +} + +static int fs_path_add(struct fs_path *p, const char *name, int name_len) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, name_len, &prepared); + if (ret < 0) + goto out; + memcpy(prepared, name, name_len); + +out: + return ret; +} + +static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); + if (ret < 0) + goto out; + memcpy(prepared, p2->start, p2->end - p2->start); + +out: + return ret; +} + +static int fs_path_add_from_extent_buffer(struct fs_path *p, + struct extent_buffer *eb, + unsigned long off, int len) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, len, &prepared); + if (ret < 0) + goto out; + + read_extent_buffer(eb, prepared, off, len); + +out: + return ret; +} + +static int fs_path_copy(struct fs_path *p, struct fs_path *from) +{ + p->reversed = from->reversed; + fs_path_reset(p); + + return fs_path_add_path(p, from); +} + +static void fs_path_unreverse(struct fs_path *p) +{ + char *tmp; + int len; + + if (!p->reversed) + return; + + tmp = p->start; + len = p->end - p->start; + p->start = p->buf; + p->end = p->start + len; + memmove(p->start, tmp, len + 1); + p->reversed = 0; +} + +static struct btrfs_path *alloc_path_for_send(void) +{ + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return NULL; + path->search_commit_root = 1; + path->skip_locking = 1; + path->need_commit_sem = 1; + return path; +} + +static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) +{ + int ret; + u32 pos = 0; + + while (pos < len) { + ret = kernel_write(filp, buf + pos, len - pos, off); + if (ret < 0) + return ret; + if (ret == 0) + return -EIO; + pos += ret; + } + + return 0; +} + +static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) +{ + struct btrfs_tlv_header *hdr; + int total_len = sizeof(*hdr) + len; + int left = sctx->send_max_size - sctx->send_size; + + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + + if (unlikely(left < total_len)) + return -EOVERFLOW; + + hdr = (struct btrfs_tlv_header *) (sctx->send_buf + 
sctx->send_size); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + memcpy(hdr + 1, data, len); + sctx->send_size += total_len; + + return 0; +} + +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } + +TLV_PUT_DEFINE_INT(8) +TLV_PUT_DEFINE_INT(32) +TLV_PUT_DEFINE_INT(64) + +static int tlv_put_string(struct send_ctx *sctx, u16 attr, + const char *str, int len) +{ + if (len == -1) + len = strlen(str); + return tlv_put(sctx, attr, str, len); +} + +static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, + const u8 *uuid) +{ + return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); +} + +static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, + struct extent_buffer *eb, + struct btrfs_timespec *ts) +{ + struct btrfs_timespec bts; + read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); + return tlv_put(sctx, attr, &bts, sizeof(bts)); +} + + +#define TLV_PUT(sctx, attrtype, data, attrlen) \ + do { \ + ret = tlv_put(sctx, attrtype, data, attrlen); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_INT(sctx, attrtype, bits, value) \ + do { \ + ret = tlv_put_u##bits(sctx, attrtype, value); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) +#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) +#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) +#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) +#define TLV_PUT_STRING(sctx, attrtype, str, len) \ + do { \ + ret = tlv_put_string(sctx, attrtype, str, len); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_PATH(sctx, attrtype, p) \ + do { \ + ret = tlv_put_string(sctx, attrtype, p->start, \ + p->end - p->start); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while(0) +#define TLV_PUT_UUID(sctx, attrtype, uuid) \ + do { \ + ret = tlv_put_uuid(sctx, attrtype, uuid); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ + do { \ + ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +static int send_header(struct send_ctx *sctx) +{ + struct btrfs_stream_header hdr; + + strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + hdr.version = cpu_to_le32(sctx->proto); + return write_buf(sctx->send_filp, &hdr, sizeof(hdr), + &sctx->send_off); +} + +/* + * For each command/item we want to send to userspace, we call this function. 
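+ * It reserves room for the command header at the start of send_buf; the + * header's length and crc32c are filled in later by send_cmd().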
+ */ +static int begin_cmd(struct send_ctx *sctx, int cmd) +{ + struct btrfs_cmd_header *hdr; + + if (WARN_ON(!sctx->send_buf)) + return -EINVAL; + + BUG_ON(sctx->send_size); + + sctx->send_size += sizeof(*hdr); + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + put_unaligned_le16(cmd, &hdr->cmd); + + return 0; +} + +static int send_cmd(struct send_ctx *sctx) +{ + int ret; + struct btrfs_cmd_header *hdr; + u32 crc; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); + + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + put_unaligned_le32(crc, &hdr->crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + + sctx->send_size = 0; + sctx->put_data = false; + + return ret; +} + +/* + * Sends a move instruction to user space + */ +static int send_rename(struct send_ctx *sctx, + struct fs_path *from, struct fs_path *to) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret; + + btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a link instruction to user space + */ +static int send_link(struct send_ctx *sctx, + struct fs_path *path, struct fs_path *lnk) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret; + + btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends an unlink instruction to user space + */ +static int send_unlink(struct send_ctx *sctx, struct fs_path *path) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret; + + btrfs_debug(fs_info, "send_unlink %s", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a rmdir instruction to user space + */ +static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret; + + btrfs_debug(fs_info, "send_rmdir %s", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +struct btrfs_inode_info { + u64 size; + u64 gen; + u64 mode; + u64 uid; + u64 gid; + u64 rdev; + u64 fileattr; + u64 nlink; +}; + +/* + * Helper function to retrieve some fields from an inode item. 
+ */ +static int get_inode_info(struct btrfs_root *root, u64 ino, + struct btrfs_inode_info *info) +{ + int ret; + struct btrfs_path *path; + struct btrfs_inode_item *ii; + struct btrfs_key key; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + + if (!info) + goto out; + + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + info->size = btrfs_inode_size(path->nodes[0], ii); + info->gen = btrfs_inode_generation(path->nodes[0], ii); + info->mode = btrfs_inode_mode(path->nodes[0], ii); + info->uid = btrfs_inode_uid(path->nodes[0], ii); + info->gid = btrfs_inode_gid(path->nodes[0], ii); + info->rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->nlink = btrfs_inode_nlink(path->nodes[0], ii); + /* + * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's + * otherwise logically split to 32/32 parts. + */ + info->fileattr = btrfs_inode_flags(path->nodes[0], ii); + +out: + btrfs_free_path(path); + return ret; +} + +static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) +{ + int ret; + struct btrfs_inode_info info = { 0 }; + + ASSERT(gen); + + ret = get_inode_info(root, ino, &info); + *gen = info.gen; + return ret; +} + +typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, + struct fs_path *p, + void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_inode_ref or + * btrfs_inode_extref. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the INODE_REF or INODE_EXTREF when called. 
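+ * For an INODE_REF the parent directory is taken from the key's offset, while + * an INODE_EXTREF stores the parent inode number in the item itself.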
+ */ +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, int resolve, + iterate_inode_ref_t iterate, void *ctx) +{ + struct extent_buffer *eb = path->nodes[0]; + struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; + struct btrfs_path *tmp_path; + struct fs_path *p; + u32 cur = 0; + u32 total; + int slot = path->slots[0]; + u32 name_len; + char *start; + int ret = 0; + int num = 0; + int index; + u64 dir; + unsigned long name_off; + unsigned long elem_size; + unsigned long ptr; + + p = fs_path_alloc_reversed(); + if (!p) + return -ENOMEM; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) { + fs_path_free(p); + return -ENOMEM; + } + + + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); + total = btrfs_item_size(eb, slot); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); + total = btrfs_item_size(eb, slot); + elem_size = sizeof(*extref); + } + + while (cur < total) { + fs_path_reset(p); + + if (found_key->type == BTRFS_INODE_REF_KEY) { + iref = (struct btrfs_inode_ref *)(ptr + cur); + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + index = btrfs_inode_ref_index(eb, iref); + dir = found_key->offset; + } else { + extref = (struct btrfs_inode_extref *)(ptr + cur); + name_len = btrfs_inode_extref_name_len(eb, extref); + name_off = (unsigned long)&extref->name; + index = btrfs_inode_extref_index(eb, extref); + dir = btrfs_inode_extref_parent(eb, extref); + } + + if (resolve) { + start = btrfs_ref_to_path(root, tmp_path, name_len, + name_off, eb, dir, + p->buf, p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + if (start < p->buf) { + /* overflow , try again with larger buffer */ + ret = fs_path_ensure_buf(p, + p->buf_len + p->buf - start); + if (ret < 0) + goto out; + start = btrfs_ref_to_path(root, tmp_path, + name_len, name_off, + eb, dir, + p->buf, p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + BUG_ON(start < p->buf); + } + p->start = start; + } else { + ret = fs_path_add_from_extent_buffer(p, eb, name_off, + name_len); + if (ret < 0) + goto out; + } + + cur += elem_size + name_len; + ret = iterate(num, dir, index, p, ctx); + if (ret) + goto out; + num++; + } + +out: + btrfs_free_path(tmp_path); + fs_path_free(p); + return ret; +} + +typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_dir_item. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the dir item when called. + */ +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + iterate_dir_item_t iterate, void *ctx) +{ + int ret = 0; + struct extent_buffer *eb; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + char *buf = NULL; + int buf_len; + u32 name_len; + u32 data_len; + u32 cur; + u32 len; + u32 total; + int slot; + int num; + + /* + * Start with a small buffer (1 page). If later we end up needing more + * space, which can happen for xattrs on a fs with a leaf size greater + * then the page size, attempt to increase the buffer. Typically xattr + * values are small. 
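+ * The buffer is grown with krealloc() when an entry needs more room, falling + * back to kvmalloc() if that fails.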
+ */ + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + cur = 0; + len = 0; + total = btrfs_item_size(eb, slot); + + num = 0; + while (cur < total) { + name_len = btrfs_dir_name_len(eb, di); + data_len = btrfs_dir_data_len(eb, di); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > + BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > PATH_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + } + + if (name_len + data_len > buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, + GFP_KERNEL | __GFP_NOWARN); + + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = kvmalloc(buf_len, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + + read_extent_buffer(eb, buf, (unsigned long)(di + 1), + name_len + data_len); + + len = sizeof(*di) + name_len + data_len; + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + + ret = iterate(num, &di_key, buf, name_len, buf + name_len, + data_len, ctx); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + num++; + } + +out: + kvfree(buf); + return ret; +} + +static int __copy_first_ref(int num, u64 dir, int index, + struct fs_path *p, void *ctx) +{ + int ret; + struct fs_path *pt = ctx; + + ret = fs_path_copy(pt, p); + if (ret < 0) + return ret; + + /* we want the first only */ + return 1; +} + +/* + * Retrieve the first path of an inode. If an inode has more then one + * ref/hardlink, this is ignored. + */ +static int get_inode_path(struct btrfs_root *root, + u64 ino, struct fs_path *path) +{ + int ret; + struct btrfs_key key, found_key; + struct btrfs_path *p; + + p = alloc_path_for_send(); + if (!p) + return -ENOMEM; + + fs_path_reset(path); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 1; + goto out; + } + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); + if (found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = -ENOENT; + goto out; + } + + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); + if (ret < 0) + goto out; + ret = 0; + +out: + btrfs_free_path(p); + return ret; +} + +struct backref_ctx { + struct send_ctx *sctx; + + /* number of total found references */ + u64 found; + + /* + * used for clones found in send_root. clones found behind cur_objectid + * and cur_offset are not considered as allowed clones. + */ + u64 cur_objectid; + u64 cur_offset; + + /* may be truncated in case it's the last extent in a file */ + u64 extent_len; + + /* The bytenr the file extent item we are processing refers to. */ + u64 bytenr; + /* The owner (root id) of the data backref for the current extent. */ + u64 backref_owner; + /* The offset of the data backref for the current extent. 
*/ + u64 backref_offset; +}; + +static int __clone_root_cmp_bsearch(const void *key, const void *elt) +{ + u64 root = (u64)(uintptr_t)key; + const struct clone_root *cr = elt; + + if (root < cr->root->root_key.objectid) + return -1; + if (root > cr->root->root_key.objectid) + return 1; + return 0; +} + +static int __clone_root_cmp_sort(const void *e1, const void *e2) +{ + const struct clone_root *cr1 = e1; + const struct clone_root *cr2 = e2; + + if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) + return -1; + if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) + return 1; + return 0; +} + +/* + * Called for every backref that is found for the current extent. + * Results are collected in sctx->clone_roots->ino/offset. + */ +static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + void *ctx_) +{ + struct backref_ctx *bctx = ctx_; + struct clone_root *clone_root; + + /* First check if the root is in the list of accepted clone sources */ + clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!clone_root) + return 0; + + /* This is our own reference, bail out as we can't clone from it. */ + if (clone_root->root == bctx->sctx->send_root && + ino == bctx->cur_objectid && + offset == bctx->cur_offset) + return 0; + + /* + * Make sure we don't consider clones from send_root that are + * behind the current inode/offset. + */ + if (clone_root->root == bctx->sctx->send_root) { + /* + * If the source inode was not yet processed we can't issue a + * clone operation, as the source extent does not exist yet at + * the destination of the stream. + */ + if (ino > bctx->cur_objectid) + return 0; + /* + * We clone from the inode currently being sent as long as the + * source extent is already processed, otherwise we could try + * to clone from an extent that does not exist yet at the + * destination of the stream. + */ + if (ino == bctx->cur_objectid && + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) + return 0; + } + + bctx->found++; + clone_root->found_ref = true; + + /* + * If the given backref refers to a file extent item with a larger + * number of bytes than what we found before, use the new one so that + * we clone more optimally and end up doing less writes and getting + * less exclusive, non-shared extents at the destination. + */ + if (num_bytes > clone_root->num_bytes) { + clone_root->ino = ino; + clone_root->offset = offset; + clone_root->num_bytes = num_bytes; + + /* + * Found a perfect candidate, so there's no need to continue + * backref walking. 
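+ * Returning BTRFS_ITERATE_EXTENT_INODES_STOP below ends the backref walk + * early.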
+ */ + if (num_bytes >= bctx->extent_len) + return BTRFS_ITERATE_EXTENT_INODES_STOP; + } + + return 0; +} + +static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) +{ + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; + struct backref_cache_entry *entry; + + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + return false; + + /* + * If relocation happened since we first filled the cache, then we must + * empty the cache and can not use it, because even though we operate on + * read-only roots, their leaves and nodes may have been reallocated and + * now be used for different nodes/leaves of the same tree or some other + * tree. + * + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); + return false; + } + + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) + return false; + + entry = container_of(raw_entry, struct backref_cache_entry, entry); + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; + + return true; +} + +static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + void *ctx) +{ + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct backref_cache_entry *new_entry; + struct ulist_iterator uiter; + struct ulist_node *node; + int ret; + + /* + * We're called while holding a transaction handle or while holding + * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a + * NOFS allocation. + */ + new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); + /* No worries, cache is optional. */ + if (!new_entry) + return; + + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { + const u64 root_id = node->val; + struct clone_root *root; + + root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, + sctx->clone_roots_cnt, sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!root) + continue; + + /* Too many roots, just exit, no worries as caching is optional. */ + if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { + kfree(new_entry); + return; + } + + new_entry->root_ids[new_entry->num_roots] = root_id; + new_entry->num_roots++; + } + + /* + * We may have not added any roots to the new cache entry, which means + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. + * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. + */ + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. 
*/ + kfree(new_entry); + return; + } + + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; +} + +static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *ctx) +{ + const u64 refs = btrfs_extent_refs(leaf, ei); + const struct backref_ctx *bctx = ctx; + const struct send_ctx *sctx = bctx->sctx; + + if (bytenr == bctx->bytenr) { + const u64 flags = btrfs_extent_flags(leaf, ei); + + if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return -EUCLEAN; + + /* + * If we have only one reference and only the send root as a + * clone source - meaning no clone roots were given in the + * struct btrfs_ioctl_send_args passed to the send ioctl - then + * it's our reference and there's no point in doing backref + * walking which is expensive, so exit early. + */ + if (refs == 1 && sctx->clone_roots_cnt == 1) + return -ENOENT; + } + + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (refs > SEND_MAX_EXTENT_REFS) + return -ENOENT; + + return 0; +} + +static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) +{ + const struct backref_ctx *bctx = ctx; + + if (ino == bctx->cur_objectid && + root == bctx->backref_owner && + offset == bctx->backref_offset) + return true; + + return false; +} + +/* + * Given an inode, offset and extent item, it finds a good clone for a clone + * instruction. Returns -ENOENT when none could be found. The function makes + * sure that the returned clone is usable at the point where sending is at the + * moment. This means, that no clones are accepted which lie behind the current + * inode+offset. + * + * path must point to the extent item when called. + */ +static int find_extent_clone(struct send_ctx *sctx, + struct btrfs_path *path, + u64 ino, u64 data_offset, + u64 ino_size, + struct clone_root **found) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret; + int extent_type; + u64 logical; + u64 disk_byte; + u64 num_bytes; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb = path->nodes[0]; + struct backref_ctx backref_ctx = { 0 }; + struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; + struct clone_root *cur_clone_root; + int compressed; + u32 i; + + /* + * With fallocate we can get prealloc extents beyond the inode's i_size, + * so we don't do anything here because clone operations can not clone + * to a range beyond i_size without increasing the i_size of the + * destination inode. + */ + if (data_offset >= ino_size) + return 0; + + fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + return -ENOENT; + + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) + return -ENOENT; + + compressed = btrfs_file_extent_compression(eb, fi); + num_bytes = btrfs_file_extent_num_bytes(eb, fi); + logical = disk_byte + btrfs_file_extent_offset(eb, fi); + + /* + * Setup the clone roots. 
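+ * Every candidate clone root is reset here (ino, offset, num_bytes and + * found_ref) before the backref walk below fills in the best match per root.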
+ */ + for (i = 0; i < sctx->clone_roots_cnt; i++) { + cur_clone_root = sctx->clone_roots + i; + cur_clone_root->ino = (u64)-1; + cur_clone_root->offset = 0; + cur_clone_root->num_bytes = 0; + cur_clone_root->found_ref = false; + } + + backref_ctx.sctx = sctx; + backref_ctx.cur_objectid = ino; + backref_ctx.cur_offset = data_offset; + backref_ctx.bytenr = disk_byte; + /* + * Use the header owner and not the send root's id, because in case of a + * snapshot we can have shared subtrees. + */ + backref_ctx.backref_owner = btrfs_header_owner(eb); + backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); + + /* + * The last extent of a file may be too large due to page alignment. + * We need to adjust extent_len in this case so that the checks in + * iterate_backrefs() work. + */ + if (data_offset + num_bytes >= ino_size) + backref_ctx.extent_len = ino_size - data_offset; + else + backref_ctx.extent_len = num_bytes; + + /* + * Now collect all backrefs. + */ + backref_walk_ctx.bytenr = disk_byte; + if (compressed == BTRFS_COMPRESS_NONE) + backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); + backref_walk_ctx.fs_info = fs_info; + backref_walk_ctx.cache_lookup = lookup_backref_cache; + backref_walk_ctx.cache_store = store_backref_cache; + backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.check_extent_item = check_extent_item; + backref_walk_ctx.user_ctx = &backref_ctx; + + /* + * If have a single clone root, then it's the send root and we can tell + * the backref walking code to skip our own backref and not resolve it, + * since we can not use it for cloning - the source and destination + * ranges can't overlap and in case the leaf is shared through a subtree + * due to snapshots, we can't use those other roots since they are not + * in the list of clone roots. + */ + if (sctx->clone_roots_cnt == 1) + backref_walk_ctx.skip_data_ref = skip_self_data_ref; + + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, + &backref_ctx); + if (ret < 0) + return ret; + + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + /* + * A transaction commit for a transaction in which block group + * relocation was done just happened. + * The disk_bytenr of the file extent item we processed is + * possibly stale, referring to the extent's location before + * relocation. So act as if we haven't found any clone sources + * and fallback to write commands, which will read the correct + * data from the new extent location. Otherwise we will fail + * below because we haven't found our own back reference or we + * could be getting incorrect sources in case the old extent + * was already reallocated after the relocation. + */ + up_read(&fs_info->commit_root_sem); + return -ENOENT; + } + up_read(&fs_info->commit_root_sem); + + btrfs_debug(fs_info, + "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", + data_offset, ino, num_bytes, logical); + + if (!backref_ctx.found) { + btrfs_debug(fs_info, "no clones found"); + return -ENOENT; + } + + cur_clone_root = NULL; + for (i = 0; i < sctx->clone_roots_cnt; i++) { + struct clone_root *clone_root = &sctx->clone_roots[i]; + + if (!clone_root->found_ref) + continue; + + /* + * Choose the root from which we can clone more bytes, to + * minimize write operations and therefore have more extent + * sharing at the destination (the same as in the source). 
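+ * If none of the clone roots ends up with a usable reference, -ENOENT is + * returned and the caller falls back to regular write commands.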
+ */ + if (!cur_clone_root || + clone_root->num_bytes > cur_clone_root->num_bytes) { + cur_clone_root = clone_root; + + /* + * We found an optimal clone candidate (any inode from + * any root is fine), so we're done. + */ + if (clone_root->num_bytes >= backref_ctx.extent_len) + break; + } + } + + if (cur_clone_root) { + *found = cur_clone_root; + ret = 0; + } else { + ret = -ENOENT; + } + + return ret; +} + +static int read_symlink(struct btrfs_root *root, + u64 ino, + struct fs_path *dest) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u8 type; + u8 compression; + unsigned long off; + int len; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret) { + /* + * An empty symlink inode. Can happen in rare error paths when + * creating a symlink (transaction committed before the inode + * eviction handler removed the symlink inode items and a crash + * happened in between or the subvol was snapshoted in between). + * Print an informative message to dmesg/syslog so that the user + * can delete the symlink. + */ + btrfs_err(root->fs_info, + "Found empty symlink inode %llu at root %llu", + ino, root->root_key.objectid); + ret = -EIO; + goto out; + } + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent that is not inline, ino %llu root %llu extent type %d", + ino, btrfs_root_id(root), type); + goto out; + } + compression = btrfs_file_extent_compression(path->nodes[0], ei); + if (unlikely(compression != BTRFS_COMPRESS_NONE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent with compression, ino %llu root %llu compression type %d", + ino, btrfs_root_id(root), compression); + goto out; + } + + off = btrfs_file_extent_inline_start(ei); + len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); + + ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Helper function to generate a file name that is unique in the root of + * send_root and parent_root. This is used to generate names for orphan inodes. 
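+ * The generated names have the form "o<ino>-<gen>-<idx>", with idx bumped + * until the name is unused in both the send root and the parent root.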
+ */ +static int gen_unique_name(struct send_ctx *sctx, + u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *di; + char tmp[64]; + int len; + u64 idx = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + while (1) { + struct fscrypt_str tmp_name; + + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", + ino, gen, idx); + ASSERT(len < sizeof(tmp)); + tmp_name.name = tmp; + tmp_name.len = strlen(tmp); + + di = btrfs_lookup_dir_item(NULL, sctx->send_root, + path, BTRFS_FIRST_FREE_OBJECTID, + &tmp_name, 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + + if (!sctx->parent_root) { + /* unique */ + ret = 0; + break; + } + + di = btrfs_lookup_dir_item(NULL, sctx->parent_root, + path, BTRFS_FIRST_FREE_OBJECTID, + &tmp_name, 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + /* unique */ + break; + } + + ret = fs_path_add(dest, tmp, strlen(tmp)); + +out: + btrfs_free_path(path); + return ret; +} + +enum inode_state { + inode_state_no_change, + inode_state_will_create, + inode_state_did_create, + inode_state_will_delete, + inode_state_did_delete, +}; + +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) +{ + int ret; + int left_ret; + int right_ret; + u64 left_gen; + u64 right_gen = 0; + struct btrfs_inode_info info; + + ret = get_inode_info(sctx->send_root, ino, &info); + if (ret < 0 && ret != -ENOENT) + goto out; + left_ret = (info.nlink == 0) ? -ENOENT : ret; + left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); + + if (!sctx->parent_root) { + right_ret = -ENOENT; + } else { + ret = get_inode_info(sctx->parent_root, ino, &info); + if (ret < 0 && ret != -ENOENT) + goto out; + right_ret = (info.nlink == 0) ? -ENOENT : ret; + right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); + } + + if (!left_ret && !right_ret) { + if (left_gen == gen && right_gen == gen) { + ret = inode_state_no_change; + } else if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else if (!left_ret) { + if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else { + ret = -ENOENT; + } + } else if (!right_ret) { + if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else { + ret = -ENOENT; + } + +out: + return ret; +} + +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) +{ + int ret; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); + if (ret < 0) + goto out; + + if (ret == inode_state_no_change || + ret == inode_state_did_create || + ret == inode_state_will_delete) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +/* + * Helper function to lookup a dir item in a dir. 
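+ * Returns -ENOENT if there is no such entry or if the entry points to a + * subvolume (a BTRFS_ROOT_ITEM_KEY) rather than an inode.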
+ */ +static int lookup_dir_item_inode(struct btrfs_root *root, + u64 dir, const char *name, int name_len, + u64 *found_inode) +{ + int ret = 0; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_path *path; + struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; + goto out; + } + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } + *found_inode = key.objectid; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, + * generation of the parent dir and the name of the dir entry. + */ +static int get_first_ref(struct btrfs_root *root, u64 ino, + u64 *dir, u64 *dir_gen, struct fs_path *name) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int len; + u64 parent_dir; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (!ret) + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (ret || found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = -ENOENT; + goto out; + } + + if (found_key.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + iref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], iref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)(iref + 1), + len); + parent_dir = found_key.offset; + } else { + struct btrfs_inode_extref *extref; + extref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_extref); + len = btrfs_inode_extref_name_len(path->nodes[0], extref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)&extref->name, len); + parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); + } + if (ret < 0) + goto out; + btrfs_release_path(path); + + if (dir_gen) { + ret = get_inode_gen(root, parent_dir, dir_gen); + if (ret < 0) + goto out; + } + + *dir = parent_dir; + +out: + btrfs_free_path(path); + return ret; +} + +static int is_first_ref(struct btrfs_root *root, + u64 ino, u64 dir, + const char *name, int name_len) +{ + int ret; + struct fs_path *tmp_name; + u64 tmp_dir; + + tmp_name = fs_path_alloc(); + if (!tmp_name) + return -ENOMEM; + + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); + if (ret < 0) + goto out; + + if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { + ret = 0; + goto out; + } + + ret = !memcmp(tmp_name->start, name, name_len); + +out: + fs_path_free(tmp_name); + return ret; +} + +/* + * Used by process_recorded_refs to determine if a new ref would overwrite an + * already existing ref. In case it detects an overwrite, it returns the + * inode/gen in who_ino/who_gen. + * When an overwrite is detected, process_recorded_refs does proper orphanizing + * to make sure later references to the overwritten inode are possible. + * Orphanizing is however only required for the first ref of an inode. 
+ * process_recorded_refs does an additional is_first_ref check to see if + * orphanizing is really required. + */ +static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + const char *name, int name_len, + u64 *who_ino, u64 *who_gen, u64 *who_mode) +{ + int ret; + u64 parent_root_dir_gen; + u64 other_inode = 0; + struct btrfs_inode_info info; + + if (!sctx->parent_root) + return 0; + + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); + if (ret <= 0) + return 0; + + /* + * If we have a parent root we need to verify that the parent dir was + * not deleted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. + */ + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; + + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, + &other_inode); + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; + + /* + * Check if the overwritten ref was already processed. If yes, the ref + * was already unlinked/moved, so we can safely assume that we will not + * overwrite anything at this point in time. + */ + if (other_inode > sctx->send_progress || + is_waiting_for_move(sctx, other_inode)) { + ret = get_inode_info(sctx->parent_root, other_inode, &info); + if (ret < 0) + return ret; + + *who_ino = other_inode; + *who_gen = info.gen; + *who_mode = info.mode; + return 1; + } + + return 0; +} + +/* + * Checks if the ref was overwritten by an already processed inode. This is + * used by __get_cur_name_and_parent to find out if the ref was orphanized and + * thus the orphan name needs be used. + * process_recorded_refs also uses it to avoid unlinking of refs that were + * overwritten. + */ +static int did_overwrite_ref(struct send_ctx *sctx, + u64 dir, u64 dir_gen, + u64 ino, u64 ino_gen, + const char *name, int name_len) +{ + int ret; + u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; + + if (!sctx->parent_root) + return 0; + + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); + if (ret <= 0) + return ret; + + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. + */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; + + /* check if the ref was overwritten by another ref */ + ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, + &ow_inode); + if (ret == -ENOENT) { + /* was never and will never be overwritten */ + return 0; + } else if (ret < 0) { + return ret; + } + + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; + } + + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore check if ow_inode matches + * the current inode being processed. + */ + if (ow_inode < sctx->send_progress) + return 1; + + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; +} + +/* + * Same as did_overwrite_ref, but also checks if it is the first ref of an inode + * that got overwritten. 
This is used by process_recorded_refs to determine + * if it has to use the path as returned by get_cur_path or the orphan name. + */ +static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 dir; + u64 dir_gen; + + if (!sctx->parent_root) + goto out; + + name = fs_path_alloc(); + if (!name) + return -ENOMEM; + + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); + if (ret < 0) + goto out; + + ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, + name->start, fs_path_len(name)); + +out: + fs_path_free(name); + return ret; +} + +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) + return NULL; + + return container_of(entry, struct name_cache_entry, entry); +} + +/* + * Used by get_cur_path for each ref up to the root. + * Returns 0 if it succeeded. + * Returns 1 if the inode is not existent or got overwritten. In that case, the + * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 + * is returned, parent_ino/parent_gen are not guaranteed to be valid. + * Returns <0 in case of error. + */ +static int __get_cur_name_and_parent(struct send_ctx *sctx, + u64 ino, u64 gen, + u64 *parent_ino, + u64 *parent_gen, + struct fs_path *dest) +{ + int ret; + int nce_ret; + struct name_cache_entry *nce; + + /* + * First check if we already did a call to this function with the same + * ino/gen. If yes, check if the cache entry is still up-to-date. If yes + * return the cached result. + */ + nce = name_cache_search(sctx, ino, gen); + if (nce) { + if (ino < sctx->send_progress && nce->need_later_update) { + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); + nce = NULL; + } else { + *parent_ino = nce->parent_ino; + *parent_gen = nce->parent_gen; + ret = fs_path_add(dest, nce->name, nce->name_len); + if (ret < 0) + goto out; + ret = nce->ret; + goto out; + } + } + + /* + * If the inode is not existent yet, add the orphan name and return 1. + * This should only happen for the parent dir that we determine in + * record_new_ref_if_needed(). + */ + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); + if (ret < 0) + goto out; + + if (!ret) { + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + goto out_cache; + } + + /* + * Depending on whether the inode was already processed or not, use + * send_root or parent_root for ref lookup. + */ + if (ino < sctx->send_progress) + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); + else + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); + if (ret < 0) + goto out; + + /* + * Check if the ref was overwritten by an inode's ref that was processed + * earlier. If yes, treat as orphan and return 1. + */ + ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, + dest->start, dest->end - dest->start); + if (ret < 0) + goto out; + if (ret) { + fs_path_reset(dest); + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + } + +out_cache: + /* + * Store the result of the lookup in the name cache. 
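+ * Entries are keyed by (ino, gen) in an LRU cache; need_later_update is set + * for inodes not yet processed so that the entry is dropped and recomputed + * once send progress passes them.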
+ */ + nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); + if (!nce) { + ret = -ENOMEM; + goto out; + } + + nce->entry.key = ino; + nce->entry.gen = gen; + nce->parent_ino = *parent_ino; + nce->parent_gen = *parent_gen; + nce->name_len = fs_path_len(dest); + nce->ret = ret; + strcpy(nce->name, dest->start); + + if (ino < sctx->send_progress) + nce->need_later_update = 0; + else + nce->need_later_update = 1; + + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); + ret = nce_ret; + } + +out: + return ret; +} + +/* + * Magic happens here. This function returns the first ref to an inode as it + * would look like while receiving the stream at this point in time. + * We walk the path up to the root. For every inode in between, we check if it + * was already processed/sent. If yes, we continue with the parent as found + * in send_root. If not, we continue with the parent as found in parent_root. + * If we encounter an inode that was deleted at this point in time, we use the + * inodes "orphan" name instead of the real name and stop. Same with new inodes + * that were not created yet and overwritten inodes/refs. + * + * When do we have orphan inodes: + * 1. When an inode is freshly created and thus no valid refs are available yet + * 2. When a directory lost all it's refs (deleted) but still has dir items + * inside which were not processed yet (pending for move/delete). If anyone + * tried to get the path to the dir items, it would get a path inside that + * orphan directory. + * 3. When an inode is moved around or gets new links, it may overwrite the ref + * of an unprocessed inode. If in that case the first ref would be + * overwritten, the overwritten inode gets "orphanized". Later when we + * process this overwritten inode, it is restored at a new place by moving + * the orphan inode. + * + * sctx->send_progress tells this function at which point in time receiving + * would be. 
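+ * The path is built child-to-root into a reversed fs_path and unreversed + * before returning.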
+ */ +static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 parent_inode = 0; + u64 parent_gen = 0; + int stop = 0; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + + dest->reversed = 1; + fs_path_reset(dest); + + while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino, gen)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + + if (ret < 0) + goto out; + + ret = fs_path_add_path(dest, name); + if (ret < 0) + goto out; + + ino = parent_inode; + gen = parent_gen; + } + +out: + fs_path_free(name); + if (!ret) + fs_path_unreverse(dest); + return ret; +} + +/* + * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace + */ +static int send_subvol_begin(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_root *parent_root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + char *name = NULL; + int namelen; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); + if (!name) { + btrfs_free_path(path); + return -ENOMEM; + } + + key.objectid = send_root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, + &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type != BTRFS_ROOT_BACKREF_KEY || + key.objectid != send_root->root_key.objectid) { + ret = -ENOENT; + goto out; + } + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + namelen = btrfs_root_ref_name_len(leaf, ref); + read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); + btrfs_release_path(path); + + if (parent_root) { + ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); + if (ret < 0) + goto out; + } else { + ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); + if (ret < 0) + goto out; + } + + TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, + btrfs_root_ctransid(&sctx->send_root->root_item)); + if (parent_root) { + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + btrfs_root_ctransid(&sctx->parent_root->root_item)); + } + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + btrfs_free_path(path); + kfree(name); + return ret; +} + +static int send_truncate(struct 
send_ctx *sctx, u64 ino, u64 gen, u64 size) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + if (sctx->proto < 2) + return 0; + + btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", + ino, uid, gid); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p = NULL; + struct btrfs_inode_item *ii; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + struct btrfs_key key; + int slot; + + btrfs_debug(fs_info, "send_utimes %llu", ino); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + slot = path->slots[0]; + ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, 
BTRFS_SEND_A_PATH, p); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); + if (sctx->proto >= 2) + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + btrfs_free_path(path); + return ret; +} + +/* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have + * a valid path yet because we did not process the refs yet. So, the inode + * is created as orphan. 
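+ * The orphan name comes from gen_unique_name(); the inode is renamed to its + * final path later, once its refs have been processed.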
+ */ +static int send_create_inode(struct send_ctx *sctx, u64 ino) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + int cmd; + struct btrfs_inode_info info; + u64 gen; + u64 mode; + u64 rdev; + + btrfs_debug(fs_info, "send_create_inode %llu", ino); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, &info); + if (ret < 0) + goto out; + gen = info.gen; + mode = info.mode; + rdev = info.rdev; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } + + if (S_ISREG(mode)) { + cmd = BTRFS_SEND_C_MKFILE; + } else if (S_ISDIR(mode)) { + cmd = BTRFS_SEND_C_MKDIR; + } else if (S_ISLNK(mode)) { + cmd = BTRFS_SEND_C_SYMLINK; + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { + cmd = BTRFS_SEND_C_MKNOD; + } else if (S_ISFIFO(mode)) { + cmd = BTRFS_SEND_C_MKFIFO; + } else if (S_ISSOCK(mode)) { + cmd = BTRFS_SEND_C_MKSOCK; + } else { + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", + (int)(mode & S_IFMT)); + ret = -EOPNOTSUPP; + goto out; + } + + ret = begin_cmd(sctx, cmd); + if (ret < 0) + goto out; + + ret = gen_unique_name(sctx, ino, gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); + + if (S_ISLNK(mode)) { + fs_path_reset(p); + ret = read_symlink(sctx->send_root, ino, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); + } + + ret = send_cmd(sctx); + if (ret < 0) + goto out; + + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + +/* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. + * This function does the check if we already created the dir out of order. 
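For reference, the type dispatch inside send_create_inode() above reduces to a fixed mapping from the inode mode to one create command; a small userspace restatement (command names shortened from the BTRFS_SEND_C_* constants):

#include <stddef.h>
#include <sys/types.h>
#include <sys/stat.h>

/*
 * Same dispatch as send_create_inode(), expressed over a plain st_mode;
 * returns NULL for types the send stream has no create command for.
 */
static const char *create_cmd_name(mode_t mode)
{
        if (S_ISREG(mode))
                return "MKFILE";
        if (S_ISDIR(mode))
                return "MKDIR";
        if (S_ISLNK(mode))
                return "SYMLINK";
        if (S_ISCHR(mode) || S_ISBLK(mode))
                return "MKNOD";
        if (S_ISFIFO(mode))
                return "MKFIFO";
        if (S_ISSOCK(mode))
                return "MKSOCK";
        return NULL;            /* unexpected type, send returns -EOPNOTSUPP */
}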
+ */ +static int did_create_dir(struct send_ctx *sctx, u64 dir) +{ + int ret = 0; + int iter_ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *eb = path->nodes[0]; + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + break; + } + + di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { + ret = 1; + cache_dir_created(sctx, dir); + break; + } + } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; + + btrfs_free_path(path); + return ret; +} + +/* + * Only creates the inode if it is: + * 1. Not a directory + * 2. Or a directory which was not created already due to out of order + * directories. See did_create_dir and process_recorded_refs for details. + */ +static int send_create_inode_if_needed(struct send_ctx *sctx) +{ + int ret; + + if (S_ISDIR(sctx->cur_inode_mode)) { + ret = did_create_dir(sctx, sctx->cur_ino); + if (ret < 0) + return ret; + else if (ret > 0) + return 0; + } + + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; +} + +struct recorded_ref { + struct list_head list; + char *name; + struct fs_path *full_path; + u64 dir; + u64 dir_gen; + int name_len; + struct rb_node node; + struct rb_root *root; +}; + +static struct recorded_ref *recorded_ref_alloc(void) +{ + struct recorded_ref *ref; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + return NULL; + RB_CLEAR_NODE(&ref->node); + INIT_LIST_HEAD(&ref->list); + return ref; +} + +static void recorded_ref_free(struct recorded_ref *ref) +{ + if (!ref) + return; + if (!RB_EMPTY_NODE(&ref->node)) + rb_erase(&ref->node, ref->root); + list_del(&ref->list); + fs_path_free(ref->full_path); + kfree(ref); +} + +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; +} + +static int dup_ref(struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *new; + + new = recorded_ref_alloc(); + if (!new) + return -ENOMEM; + + new->dir = ref->dir; + new->dir_gen = ref->dir_gen; + list_add_tail(&new->list, list); + return 0; +} + +static void __free_recorded_refs(struct list_head *head) +{ + struct recorded_ref *cur; + + while (!list_empty(head)) { + cur = list_entry(head->next, struct recorded_ref, list); + recorded_ref_free(cur); + } +} + +static void free_recorded_refs(struct send_ctx *sctx) +{ + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); +} + +/* + * Renames/moves a file/dir to its orphan name. Used when the first + * ref of an unprocessed inode gets overwritten and for all non empty + * directories. 
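The orphan name used here is the temporary unique name produced by gen_unique_name(). A small sketch of that naming pattern, assuming a hypothetical name_in_use() collision check; the format matches the "o259-6-0" example quoted in a comment further down in this file:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for "does this name already exist in either tree?". */
static bool name_in_use(const char *name)
{
        (void)name;
        return false;
}

/*
 * Build a temporary orphan name for (ino, gen), bumping the trailing index
 * until a free candidate is found; inode 259 with generation 6 would
 * typically become "o259-6-0".
 */
static int gen_orphan_name(unsigned long long ino, unsigned long long gen,
                           char *buf, size_t len)
{
        unsigned long long idx = 0;

        do {
                int n = snprintf(buf, len, "o%llu-%llu-%llu", ino, gen, idx++);

                if (n < 0 || (size_t)n >= len)
                        return -1;      /* candidate does not fit in the buffer */
        } while (name_in_use(buf));
        return 0;
}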
+ */ +static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *path) +{ + int ret; + struct fs_path *orphan; + + orphan = fs_path_alloc(); + if (!orphan) + return -ENOMEM; + + ret = gen_unique_name(sctx, ino, gen, orphan); + if (ret < 0) + goto out; + + ret = send_rename(sctx, path, orphan); + +out: + fs_path_free(orphan); + return ret; +} + +static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, + u64 dir_ino, u64 dir_gen) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + p = &(*p)->rb_left; + else if (dir_ino > entry->ino) + p = &(*p)->rb_right; + else if (dir_gen < entry->gen) + p = &(*p)->rb_left; + else if (dir_gen > entry->gen) + p = &(*p)->rb_right; + else + return entry; + } + + odi = kmalloc(sizeof(*odi), GFP_KERNEL); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = dir_gen; + odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx, + u64 dir_ino, u64 gen) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else if (gen < entry->gen) + n = n->rb_left; + else if (gen > entry->gen) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + +/* + * Returns 1 if a directory can be removed at this point in time. + * We check this by iterating all dir items and checking if the inode behind + * the dir item was already processed. + */ +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) +{ + int ret = 0; + int iter_ret = 0; + struct btrfs_root *root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key loc; + struct btrfs_dir_item *di; + struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; + + /* + * Don't try to rmdir the top/root subvolume dir. + */ + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. 
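For example, if the last dir index entry of the directory points at inode 1200 while the stream is still processing inode 900, the directory cannot become removable before inode 1200 is processed, so later can_rmdir() calls can return 0 immediately instead of searching dir index keys. A compact restatement of that early exit, with a hypothetical struct name:

struct orphan_dir_state { unsigned long long dir_high_seq_ino; };

/*
 * Cheap pre-check: while the highest-numbered inode that ever had an entry
 * in the directory is still unprocessed, the directory cannot be removable,
 * so the dir index scan can be skipped entirely.
 */
static int rmdir_check_worthwhile(const struct orphan_dir_state *odi,
                                  unsigned long long cur_ino)
{
        return !odi || cur_ino >= odi->dir_high_seq_ino;
}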
+ */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. */ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (odi ? odi->last_dir_index_offset : 0); + + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct waiting_dir_move *dm; + + if (found_key.objectid != key.objectid || + found_key.type != key.type) + break; + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + dm->rmdir_ino = dir; + dm->rmdir_gen = dir_gen; + ret = 0; + goto out; + } + + if (loc.objectid > sctx->cur_ino) { + ret = 0; + goto out; + } + } + if (iter_ret < 0) { + ret = iter_ret; + goto out; + } + free_orphan_dir_info(sctx, odi); + + ret = 1; + +out: + btrfs_free_path(path); + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; +} + +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); + + return entry != NULL; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return -ENOMEM; + dm->ino = ino; + dm->rmdir_ino = 0; + dm->rmdir_gen = 0; + dm->orphanized = orphanized; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); +} + +static int add_pending_dir_move(struct send_ctx 
*sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs, + const bool is_orphan) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry = NULL, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_KERNEL); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = ino; + pm->gen = ino_gen; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino, gen)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + struct fs_path *name = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; + u64 rmdir_gen; + u64 ancestor; + bool is_orphan; + int ret; + + name = fs_path_alloc(); + from_path = fs_path_alloc(); + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } + + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + rmdir_gen = dm->rmdir_gen; + is_orphan = dm->orphanized; + free_waiting_dir_move(sctx, dm); + + if (is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, 
pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret < 0) + goto out; + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs, + is_orphan); + if (ret < 0) + goto out; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + dm->rmdir_gen = rmdir_gen; + } + goto out; + } + fs_path_reset(name); + to_path = name; + name = NULL; + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + if (rmdir_ino) { + struct orphan_dir_info *odi; + u64 gen; + + odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen); + if (!odi) { + /* already deleted */ + goto finish; + } + gen = odi->gen; + + ret = can_rmdir(sctx, rmdir_ino, gen); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + } + +finish: + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + /* + * The parent inode might have been deleted in the send snapshot + */ + ret = get_inode_info(sctx->send_root, cur->dir, NULL); + if (ret == -ENOENT) { + ret = 0; + continue; + } + if (ret < 0) + goto out; + + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(name); + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct send_ctx *sctx, + struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } + if (!RB_EMPTY_NODE(&moves->node)) { + rb_erase(&moves->node, &sctx->pending_dir_moves); + RB_CLEAR_NODE(&moves->node); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + LIST_HEAD(stack); + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + tail_append_pending_moves(sctx, pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(sctx, pm, &stack); + } + return 0; + +out: + while 
(!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +/* + * We might need to delay a directory rename even when no ancestor directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. + */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + struct waiting_dir_move *wdm; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); + if (ret < 0) + goto out; + ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + wdm = get_waiting_dir_move(sctx, di_key.objectid); + if (wdm && !wdm->orphanized) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * Check if inode ino2, or any of its ancestors, is inode ino1. 
+ * Return 1 if true, 0 if false and < 0 on error. + */ +static int check_ino_in_path(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + const u64 ino2_gen, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + if (ino1 == ino2) + return ino1_gen == ino2_gen; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent; + u64 parent_gen; + int ret; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) + return ret; + if (parent == ino1) + return parent_gen == ino1_gen; + ino = parent; + } + return 0; +} + +/* + * Check if inode ino1 is an ancestor of inode ino2 in the given root for any + * possible path (in case ino2 is not a directory and has multiple hard links). + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + bool free_fs_path = false; + int ret = 0; + int iter_ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + + if (!fs_path) { + fs_path = fs_path_alloc(); + if (!fs_path) + return -ENOMEM; + free_fs_path = true; + } + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = ino2; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + btrfs_for_each_slot(root, &key, &key, path, iter_ret) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + + if (key.objectid != ino2) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + unsigned long ptr; + struct btrfs_inode_extref *extref; + + ptr = btrfs_item_ptr_offset(leaf, slot); + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + parent = btrfs_inode_extref_parent(leaf, + extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + parent = key.offset; + cur_offset = item_size; + } + + ret = get_inode_gen(root, parent, &parent_gen); + if (ret < 0) + goto out; + ret = check_ino_in_path(root, ino1, ino1_gen, + parent, parent_gen, fs_path); + if (ret) + goto out; + } + } + ret = 0; + if (iter_ret < 0) + ret = iter_ret; + +out: + btrfs_free_path(path); + if (free_fs_path) + fs_path_free(fs_path); + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + int ret = 0; + u64 ino = parent_ref->dir; + u64 ino_gen = parent_ref->dir_gen; + u64 parent_ino_before, parent_ino_after; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + path_after = fs_path_alloc(); + path_before = fs_path_alloc(); + if (!path_after || !path_before) { + ret = -ENOMEM; + goto out; + } + + /* + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). 
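The ancestor test relied on here boils down to walking first refs from the lower directory up to the subvolume root and comparing (inode, generation) pairs. A simplified restatement of the check_ino_in_path() walk above, ignoring the multiple-hard-link handling that is_ancestor() adds by iterating every ref; parent_of() is a hypothetical callback mirroring get_first_ref():

#include <stdbool.h>

#define FIRST_FREE_OBJECTID 256ULL      /* objectid of a subvolume's root dir */

struct inode_id { unsigned long long ino, gen; };

/* True if 'a' equals 'b' or is an ancestor of 'b' along first refs. */
static bool is_ancestor_of(struct inode_id a, struct inode_id b,
                           struct inode_id (*parent_of)(struct inode_id))
{
        for (;;) {
                if (a.ino == b.ino && a.gen == b.gen)
                        return true;
                if (b.ino <= FIRST_FREE_OBJECTID)
                        return false;   /* reached the subvolume root */
                b = parent_of(b);
        }
}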
+ */ + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent_ino_after_gen; + + if (is_waiting_for_move(sctx, ino)) { + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); + if (ret) + break; + } + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + &parent_ino_after_gen, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret < 0 && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + break; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + u64 parent_ino_gen; + + ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); + if (ret < 0) + goto out; + if (ino_gen == parent_ino_gen) { + ret = 1; + break; + } + } + ino = parent_ino_after; + ino_gen = parent_ino_after_gen; + } + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } + + return ret; +} + +static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + int ret; + struct fs_path *new_path; + + /* + * Our reference's name member points to its full_path member string, so + * we use here a new path. + */ + new_path = fs_path_alloc(); + if (!new_path) + return -ENOMEM; + + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path); + if (ret < 0) { + fs_path_free(new_path); + return ret; + } + ret = fs_path_add(new_path, ref->name, ref->name_len); + if (ret < 0) { + fs_path_free(new_path); + return ret; + } + + fs_path_free(ref->full_path); + set_ref_path(ref, new_path); + + return 0; +} + +/* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". 
When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. + */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + +/* + * This does all the move/link/unlink/rmdir magic. + */ +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct recorded_ref *cur; + struct recorded_ref *cur2; + LIST_HEAD(check_dirs); + struct fs_path *valid_path = NULL; + u64 ow_inode = 0; + u64 ow_gen; + u64 ow_mode; + int did_overwrite = 0; + int is_orphan = 0; + u64 last_dir_ino_rm = 0; + bool can_rename = true; + bool orphanized_dir = false; + bool orphanized_ancestor = false; + + btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); + + /* + * This should never happen as the root dir always has the same ref + * which is always '..' + */ + BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + + valid_path = fs_path_alloc(); + if (!valid_path) { + ret = -ENOMEM; + goto out; + } + + /* + * First, check if the first ref of the current inode was overwritten + * before. If yes, we know that the current inode was already orphanized + * and thus use the orphan name. If not, we can use get_cur_path to + * get the path of the first ref as it would like while receiving at + * this point in time. + * New inodes are always orphan at the beginning, so force to use the + * orphan name in this case. + * The first ref is stored in valid_path and will be updated if it + * gets moved around. + */ + if (!sctx->cur_inode_new) { + ret = did_overwrite_first_ref(sctx, sctx->cur_ino, + sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) + did_overwrite = 1; + } + if (sctx->cur_inode_new || did_overwrite) { + ret = gen_unique_name(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } else { + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + valid_path); + if (ret < 0) + goto out; + } + + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. 
This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ + list_for_each_entry(cur, &sctx->new_refs, list) { + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) + continue; + + /* + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. + * If we find an overwritten ref that is not the first ref, + * simply unlink it. + */ + ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, + cur->name, cur->name_len, + &ow_inode, &ow_gen, &ow_mode); + if (ret < 0) + goto out; + if (ret) { + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); + if (ret < 0) + goto out; + if (ret) { + struct name_cache_entry *nce; + struct waiting_dir_move *wdm; + + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + + ret = orphanize_inode(sctx, ow_inode, ow_gen, + cur->full_path); + if (ret < 0) + goto out; + if (S_ISDIR(ow_mode)) + orphanized_dir = true; + + /* + * If ow_inode has its rename operation delayed + * make sure that its orphanized name is used in + * the source path when performing its rename + * operation. + */ + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) + wdm->orphanized = true; + + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will be orphanized as well + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); + + /* + * ow_inode might currently be an ancestor of + * cur_ino, therefore compute valid_path (the + * current path of cur_ino) again because it + * might contain the pre-orphanization name of + * ow_inode, which is no longer valid. 
+ */ + ret = is_ancestor(sctx->parent_root, + ow_inode, ow_gen, + sctx->cur_ino, NULL); + if (ret > 0) { + orphanized_ancestor = true; + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + valid_path); + } + if (ret < 0) + goto out; + } else { + /* + * If we previously orphanized a directory that + * collided with a new reference that we already + * processed, recompute the current path because + * that directory may be part of the path. + */ + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + } + + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + cache_dir_created(sctx, cur->dir); + } + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + + /* + * link/move the ref to the new place. If we have an orphan + * inode, move it and update valid_path. If not, link or move + * it depending on the inode mode. + */ + if (is_orphan && can_rename) { + ret = send_rename(sctx, valid_path, cur->full_path); + if (ret < 0) + goto out; + is_orphan = 0; + ret = fs_path_copy(valid_path, cur->full_path); + if (ret < 0) + goto out; + } else if (can_rename) { + if (S_ISDIR(sctx->cur_inode_mode)) { + /* + * Dirs can't be linked, so move it. For moved + * dirs, we always have one new and one deleted + * ref. The deleted ref is ignored later. + */ + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + if (ret < 0) + goto out; + } else { + /* + * We might have previously orphanized an inode + * which is an ancestor of our current inode, + * so our reference's full path, which was + * computed before any such orphanizations, must + * be updated. + */ + if (orphanized_dir) { + ret = update_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = send_link(sctx, cur->full_path, + valid_path); + if (ret < 0) + goto out; + } + } + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { + /* + * Check if we can already rmdir the directory. If not, + * orphanize it. 
For every dir item inside that gets deleted + * later, we do this check again and rmdir it then if possible. + * See the use of check_dirs for more details. + */ + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) { + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + } else if (!is_orphan) { + ret = orphanize_inode(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + } else if (S_ISDIR(sctx->cur_inode_mode) && + !list_empty(&sctx->deleted_refs)) { + /* + * We have a moved dir. Add the old parent to check_dirs + */ + cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, + list); + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } else if (!S_ISDIR(sctx->cur_inode_mode)) { + /* + * We have a non dir inode. Go through all deleted refs and + * unlink them if they were not already overwritten by other + * inodes. + */ + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino, sctx->cur_inode_gen, + cur->name, cur->name_len); + if (ret < 0) + goto out; + if (!ret) { + /* + * If we orphanized any ancestor before, we need + * to recompute the full path for deleted names, + * since any such path was computed before we + * processed any references and orphanized any + * ancestor inode. + */ + if (orphanized_ancestor) { + ret = update_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + /* + * If the inode is still orphan, unlink the orphan. This may + * happen when a previous inode did overwrite the first ref + * of this inode and no new refs were added for the current + * inode. Unlinking does not mean that the inode is deleted in + * all cases. There may still be links to this inode in other + * places. + */ + if (is_orphan) { + ret = send_unlink(sctx, valid_path); + if (ret < 0) + goto out; + } + } + + /* + * We did collect all parent dirs where cur_inode was once located. We + * now go through all these dirs and check if they are pending for + * deletion and if it's finally possible to perform the rmdir now. + * We also update the inode stats of the parent dirs here. + */ + list_for_each_entry(cur, &check_dirs, list) { + /* + * In case we had refs into dirs that were not processed yet, + * we don't need to do the utime and rmdir logic for these dirs. + * The dir will be processed later. 
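The per-directory decision taken in this loop can be summarized as below; a compact restatement with hypothetical enum names standing in for the get_cur_inode_state() results (the duplicate-rmdir suppression via last_dir_ino_rm is left out):

enum dir_state { DIR_WILL_CREATE, DIR_DID_CREATE, DIR_NO_CHANGE, DIR_DID_DELETE };
enum dir_action { DIR_SKIP, DIR_SEND_UTIMES, DIR_TRY_RMDIR };

/*
 * What to do for a parent directory collected in check_dirs: directories not
 * yet processed are skipped, live ones get their utimes refreshed, deleted
 * ones become rmdir candidates (still subject to can_rmdir()).
 */
static enum dir_action check_dir_action(unsigned long long dir_ino,
                                        unsigned long long cur_ino,
                                        enum dir_state state)
{
        if (dir_ino > cur_ino)
                return DIR_SKIP;                /* processed later */
        if (state == DIR_DID_CREATE || state == DIR_NO_CHANGE)
                return DIR_SEND_UTIMES;
        if (state == DIR_DID_DELETE)
                return DIR_TRY_RMDIR;
        return DIR_SKIP;
}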
+ */ + if (cur->dir > sctx->cur_ino) + continue; + + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + + if (ret == inode_state_did_create || + ret == inode_state_no_change) { + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret) { + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, valid_path); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + last_dir_ino_rm = cur->dir; + } + } + } + + ret = 0; + +out: + __free_recorded_refs(&check_dirs); + free_recorded_refs(sctx); + fs_path_free(valid_path); + return ret; +} + +static int rbtree_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + int result; + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + if (data->name_len > ref->name_len) + return 1; + if (data->name_len < ref->name_len) + return -1; + result = strcmp(data->name, ref->name); + if (result > 0) + return 1; + if (result < 0) + return -1; + return 0; +} + +static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_ref_comp(entry, parent) < 0; +} + +static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, + struct fs_path *name, u64 dir, u64 dir_gen, + struct send_ctx *sctx) +{ + int ret = 0; + struct fs_path *path = NULL; + struct recorded_ref *ref = NULL; + + path = fs_path_alloc(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ref = recorded_ref_alloc(); + if (!ref) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, dir, dir_gen, path); + if (ret < 0) + goto out; + ret = fs_path_add_path(path, name); + if (ret < 0) + goto out; + + ref->dir = dir; + ref->dir_gen = dir_gen; + set_ref_path(ref, path); + list_add_tail(&ref->list, refs); + rb_add(&ref->node, root, rbtree_ref_less); + ref->root = root; +out: + if (ret) { + if (path && (!ref || !ref->full_path)) + fs_path_free(path); + recorded_ref_free(ref); + } + return ret; +} + +static int record_new_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; + + ret = get_inode_gen(sctx->send_root, dir, &dir_gen); + if (ret < 0) + goto out; + + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_new_refs, + &sctx->new_refs, name, dir, dir_gen, + sctx); + } +out: + return ret; +} + +static int record_deleted_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; + + ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); + if (ret < 0) + goto out; + + data.dir = dir; + data.dir_gen = 
dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, + &sctx->deleted_refs, name, dir, + dir_gen, sctx); + } +out: + return ret; +} + +static int record_new_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +static int record_deleted_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, record_deleted_ref_if_needed, + sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +static int record_changed_ref(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + if (ret < 0) + goto out; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +/* + * Record and process all refs at once. Needed when an inode changes the + * generation number, which means that it was deleted and recreated. + */ +static int process_all_refs(struct send_ctx *sctx, + enum btrfs_compare_tree_result cmd) +{ + int ret = 0; + int iter_ret = 0; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + iterate_inode_ref_t cb; + int pending_move = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + if (cmd == BTRFS_COMPARE_TREE_NEW) { + root = sctx->send_root; + cb = record_new_ref_if_needed; + } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { + root = sctx->parent_root; + cb = record_deleted_ref_if_needed; + } else { + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; + } + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + if (found_key.objectid != key.objectid || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) + break; + + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + if (ret < 0) + goto out; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; + } + btrfs_release_path(path); + + /* + * We don't actually care about pending_move as we are simply + * re-creating this inode and will be rename'ing it into place once we + * rename the parent directory. 
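record_new_ref_if_needed() and record_deleted_ref_if_needed() above cancel references that appear identically on both sides, so process_recorded_refs() only ever sees real additions and removals. A toy sketch of that cancellation, using plain path strings in place of the (dir, generation, name) key the red-black trees above are ordered by:

#include <string.h>

/* Toy "set" of path strings, just enough to show the cancellation. */
struct ref_set { const char *keys[64]; int nr; };

static int ref_set_find(const struct ref_set *s, const char *key)
{
        for (int i = 0; i < s->nr; i++)
                if (strcmp(s->keys[i], key) == 0)
                        return i;
        return -1;
}

/*
 * Record 'key' on this side unless the other side already holds an identical
 * ref, in which case both copies are dropped: the ref did not change between
 * the snapshots and needs no command at all.
 */
static int record_one_side(struct ref_set *this_side,
                           struct ref_set *other_side, const char *key)
{
        int i = ref_set_find(other_side, key);

        if (i >= 0) {
                other_side->keys[i] = other_side->keys[--other_side->nr];
                return 0;
        }
        if (this_side->nr >= 64)
                return -1;              /* toy capacity limit */
        this_side->keys[this_side->nr++] = key;
        return 0;
}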
+ */ + ret = process_recorded_refs(sctx, &pending_move); +out: + btrfs_free_path(path); + return ret; +} + +static int send_set_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len, + const char *data, int data_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int send_remove_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int __process_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, const char *data, + int data_len, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + struct posix_acl_xattr_header dummy_acl; + + /* Capabilities are emitted by finish_inode_if_needed */ + if (!strncmp(name, XATTR_NAME_CAPS, name_len)) + return 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + /* + * This hack is needed because empty acls are stored as zero byte + * data in xattrs. Problem with that is, that receiving these zero byte + * acls will fail later. To fix this, we send a dummy acl list that + * only contains the version number and no entries. + */ + if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || + !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { + if (data_len == 0) { + dummy_acl.a_version = + cpu_to_le32(POSIX_ACL_XATTR_VERSION); + data = (char *)&dummy_acl; + data_len = sizeof(dummy_acl); + } + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_set_xattr(sctx, p, name, name_len, data, data_len); + +out: + fs_path_free(p); + return ret; +} + +static int __process_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_remove_xattr(sctx, p, name, name_len); + +out: + fs_path_free(p); + return ret; +} + +static int process_new_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); + + return ret; +} + +static int process_deleted_xattr(struct send_ctx *sctx) +{ + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_deleted_xattr, sctx); +} + +struct find_xattr_ctx { + const char *name; + int name_len; + int found_idx; + char *found_data; + int found_data_len; +}; + +static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, + int name_len, const char *data, int data_len, void *vctx) +{ + struct find_xattr_ctx *ctx = vctx; + + if (name_len == ctx->name_len && + strncmp(name, ctx->name, name_len) == 0) { + ctx->found_idx = num; + ctx->found_data_len = data_len; + ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); + if (!ctx->found_data) + return -ENOMEM; + return 1; + } + return 
0; +} + +static int find_xattr(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + const char *name, int name_len, + char **data, int *data_len) +{ + int ret; + struct find_xattr_ctx ctx; + + ctx.name = name; + ctx.name_len = name_len; + ctx.found_idx = -1; + ctx.found_data = NULL; + ctx.found_data_len = 0; + + ret = iterate_dir_item(root, path, __find_xattr, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + if (data) { + *data = ctx.found_data; + *data_len = ctx.found_data_len; + } else { + kfree(ctx.found_data); + } + return ctx.found_idx; +} + + +static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + char *found_data = NULL; + int found_data_len = 0; + + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); + if (ret == -ENOENT) { + ret = __process_new_xattr(num, di_key, name, name_len, data, + data_len, ctx); + } else if (ret >= 0) { + if (data_len != found_data_len || + memcmp(data, found_data, data_len)) { + ret = __process_new_xattr(num, di_key, name, name_len, + data, data_len, ctx); + } else { + ret = 0; + } + } + + kfree(found_data); + return ret; +} + +static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); + if (ret == -ENOENT) + ret = __process_deleted_xattr(num, di_key, name, name_len, data, + data_len, ctx); + else if (ret >= 0) + ret = 0; + + return ret; +} + +static int process_changed_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + __process_changed_new_xattr, sctx); + if (ret < 0) + goto out; + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); + +out: + return ret; +} + +static int process_all_new_xattrs(struct send_ctx *sctx) +{ + int ret = 0; + int iter_ret = 0; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + root = sctx->send_root; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + break; + } + + ret = iterate_dir_item(root, path, __process_new_xattr, sctx); + if (ret < 0) + break; + } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; + + btrfs_free_path(path); + return ret; +} + +static int send_verity(struct send_ctx *sctx, struct fs_path *path, + struct fsverity_descriptor *desc) +{ + int ret; + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, + le8_to_cpu(desc->hash_algorithm)); + TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, + 1U << le8_to_cpu(desc->log_blocksize)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, + le8_to_cpu(desc->salt_size)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, + le32_to_cpu(desc->sig_size)); + + 
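Every command assembled in this file follows the same begin_cmd() / TLV_PUT*() / send_cmd() pattern; on the wire that becomes a small command header followed by type/length/value attributes, and send_cmd() fills in the payload length plus a crc32c computed over the whole command with the crc field zeroed. A userspace sketch of the two headers (layout as in the v1 send stream; all integers little-endian on the wire):

#include <stdint.h>

struct cmd_header {
        uint32_t len;           /* payload length, this header excluded */
        uint16_t cmd;           /* BTRFS_SEND_C_* command number        */
        uint32_t crc;           /* crc32c of header + payload           */
} __attribute__((packed));

struct tlv_header {
        uint16_t tlv_type;      /* BTRFS_SEND_A_* attribute number      */
        uint16_t tlv_len;       /* value length in bytes                */
} __attribute__((packed));

The TLV_PUT* macros bail out to the tlv_put_failure label when an attribute would overflow the send buffer, which is why each sender above carries that label even where it looks unused.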
ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int process_verity(struct send_ctx *sctx) +{ + int ret = 0; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct inode *inode; + struct fs_path *p; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = btrfs_get_verity_descriptor(inode, NULL, 0); + if (ret < 0) + goto iput; + + if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + ret = -EMSGSIZE; + goto iput; + } + if (!sctx->verity_descriptor) { + sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, + GFP_KERNEL); + if (!sctx->verity_descriptor) { + ret = -ENOMEM; + goto iput; + } + } + + ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + if (ret < 0) + goto iput; + + p = fs_path_alloc(); + if (!p) { + ret = -ENOMEM; + goto iput; + } + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto free_path; + + ret = send_verity(sctx, p, sctx->verity_descriptor); + if (ret < 0) + goto free_path; + +free_path: + fs_path_free(p); +iput: + iput(inode); + return ret; +} + +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + sctx->put_data = true; + if (sctx->proto >= 2) { + /* + * Since v2, the data attribute header doesn't include a length, + * it is implicitly to the end of the command. + */ + if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + return -EOVERFLOW; + put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); + sctx->send_size += sizeof(__le16); + } else { + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + } + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct page *page; + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t last_index; + unsigned pg_offset = offset_in_page(offset); + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; + + last_index = (offset + len - 1) >> PAGE_SHIFT; + + while (index <= last_index) { + unsigned cur_len = min_t(unsigned, len, + PAGE_SIZE - pg_offset); + + page = find_lock_page(sctx->cur_inode->i_mapping, index); + if (!page) { + page_cache_sync_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, index, + last_index + 1 - index); + + page = find_or_create_page(sctx->cur_inode->i_mapping, + index, GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (PageReadahead(page)) + page_cache_async_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, page_folio(page), + index, last_index + 1 - index); + + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + btrfs_err(fs_info, + "send: IO error at offset %llu for inode %llu root %llu", + page_offset(page), sctx->cur_ino, + sctx->send_root->root_key.objectid); + put_page(page); + ret = -EIO; + break; + } + } + + memcpy_from_page(sctx->send_buf + sctx->send_size, page, + pg_offset, cur_len); + 
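put_data_header() above is the one attribute writer that differs between protocol versions: v1 emits a full type+length TLV header, while from v2 on only the 16-bit attribute type is written and the data implicitly runs to the end of the command. A sketch of the resulting space check, with hypothetical parameter names:

#include <stddef.h>
#include <stdint.h>

/* Header bytes needed before 'len' bytes of file data can be appended. */
static size_t data_header_size(int proto)
{
        return (proto >= 2) ? sizeof(uint16_t) : 2 * sizeof(uint16_t);
}

static int room_for_data(size_t buf_size, size_t used, int proto, size_t len)
{
        return buf_size - used >= data_header_size(proto) + len;
}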
unlock_page(page); + put_page(page); + index++; + pg_offset = 0; + len -= cur_len; + sctx->send_size += cur_len; + } + + return ret; +} + +/* + * Read some bytes from the current inode/file and send a write command to + * user space. + */ +static int send_write(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +/* + * Send a clone command to user space. + */ +static int send_clone(struct send_ctx *sctx, + u64 offset, u32 len, + struct clone_root *clone_root) +{ + int ret = 0; + struct fs_path *p; + u64 gen; + + btrfs_debug(sctx->send_root->fs_info, + "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", + offset, len, clone_root->root->root_key.objectid, + clone_root->ino, clone_root->offset); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + + if (clone_root->root == sctx->send_root) { + ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, clone_root->ino, gen, p); + } else { + ret = get_inode_path(clone_root->root, clone_root->ino, p); + } + if (ret < 0) + goto out; + + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + btrfs_root_ctransid(&clone_root->root->root_item)); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, + clone_root->offset); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +/* + * Send an update extent command to user space. 
+ */ +static int send_update_extent(struct send_ctx *sctx, + u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); + u64 offset = sctx->cur_inode_last_extent; + int ret = 0; + + /* + * A hole that starts at EOF or beyond it. Since we do not yet support + * fallocate (for extent preallocation and hole punching), sending a + * write of zeroes starting at EOF or beyond would later require issuing + * a truncate operation which would undo the write and achieve nothing. + */ + if (offset >= sctx->cur_inode_size) + return 0; + + /* + * Don't go beyond the inode's i_size due to prealloc extents that start + * after the i_size. + */ + end = min_t(u64, end, sctx->cur_inode_size); + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, end - offset); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; + while (offset < end) { + u64 len = min(end - offset, read_size); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } + sctx->cur_inode_next_write_offset = offset; +tlv_put_failure: + fs_path_free(p); + return ret; +} + +static int send_encoded_inline_extent(struct send_ctx *sctx, + struct btrfs_path *path, u64 offset, + u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 ram_bytes; + size_t inline_size; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + ram_bytes - offset, len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + 
btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + + ret = put_data_header(sctx, inline_size); + if (ret < 0) + goto out; + read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, + btrfs_file_extent_inline_start(ei), inline_size); + sctx->send_size += inline_size; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + u64 offset, u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 disk_bytenr, disk_num_bytes; + u32 data_offset; + struct btrfs_cmd_header *hdr; + u32 crc; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, + len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, + btrfs_file_extent_ram_bytes(leaf, ei)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, + offset - key.offset + btrfs_file_extent_offset(leaf, ei)); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); + + ret = put_data_header(sctx, disk_num_bytes); + if (ret < 0) + goto out; + + /* + * We want to do I/O directly into the send buffer, so get the next page + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ + data_offset = PAGE_ALIGN(sctx->send_size); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; + goto out; + } + + /* + * Note that send_buf is a mapping of send_buf_pages, so this is really + * reading into send_buf. 
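For encoded writes the extent payload is read straight into the send buffer at the next page boundary, so the emitted command consists of two regions: the header plus attributes at the start of the buffer and the raw extent data at data_offset, with a gap in between that is never written out (the checksum is likewise computed over both regions and skips the gap). A minimal sketch of the offset computation and overflow check, assuming a 4 KiB page size and illustrative ex_* names:

#include <stdint.h>

#define EX_PAGE_SIZE 4096u

static inline uint32_t ex_page_align(uint32_t x)
{
    return (x + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1);
}

/*
 * Example: with 300 bytes of header + attributes already in the buffer and a
 * 16 KiB compressed extent, data_offset becomes 4096 and the two regions are
 * written with two separate calls, skipping the 3796 byte gap.
 */
static int ex_place_encoded_data(uint32_t send_size, uint32_t send_max_size,
                                 uint32_t disk_num_bytes, uint32_t *data_offset)
{
    *data_offset = ex_page_align(send_size);
    if (*data_offset > send_max_size ||
        send_max_size - *data_offset < disk_num_bytes)
        return -1;    /* would overflow the send buffer */
    return 0;
}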
+ */ + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + disk_bytenr, disk_num_bytes, + sctx->send_buf_pages + + (data_offset >> PAGE_SHIFT)); + if (ret) + goto out; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); + hdr->crc = 0; + crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + if (!ret) { + ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, + disk_num_bytes, &sctx->send_off); + } + sctx->send_size = 0; + sctx->put_data = false; + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + const u64 offset, const u64 len) +{ + const u64 end = offset + len; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + u64 read_size = max_send_read_size(sctx); + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { + bool is_inline = (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_INLINE); + + /* + * Send the compressed extent unless the compressed data is + * larger than the decompressed data. This can happen if we're + * not sending the entire extent, either because it has been + * partially overwritten/truncated or because this is a part of + * the extent that we couldn't clone in clone_range(). + */ + if (is_inline && + btrfs_file_extent_inline_item_len(leaf, + path->slots[0]) <= len) { + return send_encoded_inline_extent(sctx, path, offset, + len); + } else if (!is_inline && + btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { + return send_encoded_extent(sctx, path, offset, len); + } + } + + if (sctx->cur_inode == NULL) { + struct btrfs_root *root = sctx->send_root; + + sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(sctx->cur_inode)) { + int err = PTR_ERR(sctx->cur_inode); + + sctx->cur_inode = NULL; + return err; + } + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); + + /* + * It's very likely there are no pages from this inode in the page + * cache, so after reading extents and sending their data, we clean + * the page cache to avoid trashing the page cache (adding pressure + * to the page cache and forcing eviction of other data more useful + * for applications). + * + * We decide if we should clean the page cache simply by checking + * if the inode's mapping nrpages is 0 when we first open it, and + * not by using something like filemap_range_has_page() before + * reading an extent because when we ask the readahead code to + * read a given file range, it may (and almost always does) read + * pages from beyond that range (see the documentation for + * page_cache_sync_readahead()), so it would not be reliable, + * because after reading the first extent future calls to + * filemap_range_has_page() would return true because the readahead + * on the previous extent resulted in reading pages of the current + * extent as well. 
+ */ + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); + } + + while (sent < len) { + u64 size = min(len - sent, read_size); + int ret; + + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + sent += size; + } + + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in + * the case of subpage sector size, but also to guarantee we evict + * pages, as passing a range that is smaller than page size does + * not evict the respective page (only zeroes part of its content). + * + * Always start from the end offset of the last range cleared. + * This is because the readahead code may (and very often does) + * reads pages beyond the range we request for readahead. So if + * we have an extent layout like this: + * + * [ extent A ] [ extent B ] [ extent C ] + * + * When we ask page_cache_sync_readahead() to read extent A, it + * may also trigger reads for pages of extent B. If we are doing + * an incremental send and extent B has not changed between the + * parent and send snapshots, some or all of its pages may end + * up being read and placed in the page cache. So when truncating + * the page cache we always start from the end offset of the + * previously processed extent up to the end of the current + * extent. + */ + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + end - 1); + sctx->page_cache_clear_start = end; + } + + return 0; +} + +/* + * Search for a capability xattr related to sctx->cur_ino. If the capability is + * found, call send_set_xattr function to emit it. + * + * Return 0 if there isn't a capability, or when the capability was emitted + * successfully, or < 0 if an error occurred. + */ +static int send_capabilities(struct send_ctx *sctx) +{ + struct fs_path *fspath = NULL; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + unsigned long data_ptr; + char *buf = NULL; + int buf_len; + int ret = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, + XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); + if (!di) { + /* There is no xattr for this inode */ + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + buf_len = btrfs_dir_data_len(leaf, di); + + fspath = fs_path_alloc(); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!fspath || !buf) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); + read_extent_buffer(leaf, buf, data_ptr, buf_len); + + ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + strlen(XATTR_NAME_CAPS), buf, buf_len); +out: + kfree(buf); + fs_path_free(fspath); + btrfs_free_path(path); + return ret; +} + +static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, + struct clone_root *clone_root, const u64 disk_byte, + u64 data_offset, u64 offset, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + struct btrfs_inode_info info; + u64 clone_src_i_size = 0; + + /* + * Prevent cloning from a zero offset with a length matching the sector + * size because in some scenarios this will make the receiver fail. 
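The page cache cleanup above only ever truncates whole pages and always resumes from the end of the previously cleared range, so pages pulled in by readahead for not-yet-processed extents are evicted once those extents are reached. A compact sketch of that bookkeeping, with the eviction callback standing in for the real page cache truncation and all ex_* names being illustrative:

#include <stdbool.h>
#include <stdint.h>

#define EX_PAGE_SIZE 4096u

struct ex_clean_state {
    bool enabled;          /* only when the mapping started out empty */
    uint64_t clear_start;  /* end of the last range we evicted */
};

static void ex_after_sent_range(struct ex_clean_state *s, uint64_t range_end,
                                void (*evict)(uint64_t start, uint64_t last))
{
    if (!s->enabled || (range_end % EX_PAGE_SIZE) != 0)
        return;                              /* never evict a partial page */
    evict(s->clear_start, range_end - 1);    /* inclusive last byte */
    s->clear_start = range_end;
}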
+ * + * For example, if in the source filesystem the extent at offset 0 + * has a length of sectorsize and it was written using direct IO, then + * it can never be an inline extent (even if compression is enabled). + * Then this extent can be cloned in the original filesystem to a non + * zero file offset, but it may not be possible to clone in the + * destination filesystem because it can be inlined due to compression + * on the destination filesystem (as the receiver's write operations are + * always done using buffered IO). The same happens when the original + * filesystem does not have compression enabled but the destination + * filesystem has. + */ + if (clone_root->offset == 0 && + len == sctx->send_root->fs_info->sectorsize) + return send_extent_data(sctx, dst_path, offset, len); + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * There are inodes that have extents that lie behind its i_size. Don't + * accept clones from these extents. + */ + ret = get_inode_info(clone_root->root, clone_root->ino, &info); + btrfs_release_path(path); + if (ret < 0) + goto out; + clone_src_i_size = info.size; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. + */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + u64 clone_data_offset; + bool crossed_src_i_size = false; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. 
+ */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_ram_bytes(leaf, ei); + ext_len = PAGE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. */ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, dst_path, offset, + hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + if (key.offset >= clone_src_i_size) + break; + + if (key.offset + ext_len > clone_src_i_size) { + ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } + + clone_data_offset = btrfs_file_extent_offset(leaf, ei); + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { + clone_root->offset = key.offset; + if (clone_data_offset < data_offset && + clone_data_offset + ext_len > data_offset) { + u64 extent_offset; + + extent_offset = data_offset - clone_data_offset; + ext_len -= extent_offset; + clone_data_offset += extent_offset; + clone_root->offset += extent_offset; + } + } + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + clone_data_offset == data_offset) { + const u64 src_end = clone_root->offset + clone_len; + const u64 sectorsize = SZ_64K; + + /* + * We can't clone the last block, when its size is not + * sector size aligned, into the middle of a file. If we + * do so, the receiver will get a failure (-EINVAL) when + * trying to clone or will silently corrupt the data in + * the destination file if it's on a kernel without the + * fix introduced by commit ac765f83f1397646 + * ("Btrfs: fix data corruption due to cloning of eof + * block). + * + * So issue a clone of the aligned down range plus a + * regular write for the eof block, if we hit that case. + * + * Also, we use the maximum possible sector size, 64K, + * because we don't know what's the sector size of the + * filesystem that receives the stream, so we have to + * assume the largest possible sector size. + */ + if (src_end == clone_src_i_size && + !IS_ALIGNED(src_end, sectorsize) && + offset + clone_len < sctx->cur_inode_size) { + u64 slen; + + slen = ALIGN_DOWN(src_end - clone_root->offset, + sectorsize); + if (slen > 0) { + ret = send_clone(sctx, offset, slen, + clone_root); + if (ret < 0) + goto out; + } + ret = send_extent_data(sctx, dst_path, + offset + slen, + clone_len - slen); + } else { + ret = send_clone(sctx, offset, clone_len, + clone_root); + } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. 
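The alignment handling above can be illustrated with concrete numbers: assume the clone source file is 100 KiB (so its last block is not aligned to the assumed worst-case 64 KiB sector size), the whole 100 KiB range is otherwise cloneable, and the destination range does not end at the destination file's i_size. The aligned-down part is cloned and the tail is sent as a regular write. A sketch under those assumptions, with illustrative ex_* names:

#include <stdint.h>

static inline uint64_t ex_align_down(uint64_t x, uint64_t a)
{
    return x - (x % a);
}

/*
 * clone_offset = 0, clone_len = 100 KiB, src_end = src i_size = 100 KiB,
 * sectorsize = 64 KiB:
 *   slen = ex_align_down(100 KiB, 64 KiB) = 64 KiB  -> sent as a clone
 *   clone_len - slen = 36 KiB                       -> sent as a write
 */
static void ex_split_eof_clone(uint64_t offset, uint64_t clone_offset,
                               uint64_t clone_len, uint64_t sectorsize,
                               void (*do_clone)(uint64_t off, uint64_t len),
                               void (*do_write)(uint64_t off, uint64_t len))
{
    uint64_t src_end = clone_offset + clone_len;
    uint64_t slen = ex_align_down(src_end - clone_offset, sectorsize);

    if (slen > 0)
        do_clone(offset, slen);
    do_write(offset + slen, clone_len - slen);
}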
That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; + } else { + ret = send_extent_data(sctx, dst_path, offset, + clone_len); + } + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + + /* + * If we are cloning from the file we are currently processing, + * and using the send root as the clone root, we must stop once + * the current clone offset reaches the current eof of the file + * at the receiver, otherwise we would issue an invalid clone + * operation (source range going beyond eof) and cause the + * receiver to fail. So if we reach the current eof, bail out + * and fallback to a regular write. + */ + if (clone_root->root == sctx->send_root && + clone_root->ino == sctx->cur_ino && + clone_root->offset >= sctx->cur_inode_next_write_offset) + break; + + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, dst_path, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int send_write_or_clone(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key, + struct clone_root *clone_root) +{ + int ret = 0; + u64 offset = key->offset; + u64 end; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; + + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; + u64 disk_byte; + u64 data_offset; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, path, clone_root, disk_byte, + data_offset, offset, end - offset); + } else { + ret = send_extent_data(sctx, path, offset, end - offset); + } + sctx->cur_inode_next_write_offset = end; + return ret; +} + +static int is_extent_unchanged(struct send_ctx *sctx, + struct btrfs_path *left_path, + struct btrfs_key *ekey) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + struct btrfs_key found_key; + struct btrfs_file_extent_item *ei; + u64 left_disknr; + u64 right_disknr; + u64 left_offset; + u64 right_offset; + u64 left_offset_fixed; + u64 left_len; + u64 right_len; + u64 left_gen; + u64 right_gen; + u8 left_type; + u8 right_type; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + eb = left_path->nodes[0]; + slot = left_path->slots[0]; + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + left_type = btrfs_file_extent_type(eb, ei); + + if (left_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + left_len = btrfs_file_extent_num_bytes(eb, ei); + left_offset = btrfs_file_extent_offset(eb, ei); + left_gen = btrfs_file_extent_generation(eb, ei); + + /* + * Following comments will refer to these graphics. L is the left + * extents which we are checking at the moment. 1-8 are the right + * extents that we iterate. + * + * |-----L-----| + * |-1-|-2a-|-3-|-4-|-5-|-6-| + * + * |-----L-----| + * |--1--|-2b-|...(same as above) + * + * Alternative situation. Happens on files where extents got split. 
+ * |-----L-----| + * |-----------7-----------|-6-| + * + * Alternative situation. Happens on files which got larger. + * |-----L-----| + * |-8-| + * Nothing follows after 8. + */ + + key.objectid = ekey->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ekey->offset; + ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + /* + * Handle special case where the right side has no extents at all. + */ + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + /* If we're a hole then just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; + goto out; + } + + /* + * We're now on 2a, 2b or 7. + */ + key = found_key; + while (key.offset < ekey->offset + left_len) { + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + right_type = btrfs_file_extent_type(eb, ei); + if (right_type != BTRFS_FILE_EXTENT_REG && + right_type != BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + right_len = btrfs_file_extent_ram_bytes(eb, ei); + right_len = PAGE_ALIGN(right_len); + } else { + right_len = btrfs_file_extent_num_bytes(eb, ei); + } + + /* + * Are we at extent 8? If yes, we know the extent is changed. + * This may only happen on the first iteration. + */ + if (found_key.offset + right_len <= ekey->offset) { + /* If we're a hole just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; + goto out; + } + + /* + * We just wanted to see if when we have an inline extent, what + * follows it is a regular extent (wanted to check the above + * condition for inline extents too). This should normally not + * happen but it's possible for example when we have an inline + * compressed extent representing data with a size matching + * the page size (currently the same as sector size). + */ + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + + left_offset_fixed = left_offset; + if (key.offset < ekey->offset) { + /* Fix the right offset for 2a and 7. */ + right_offset += ekey->offset - key.offset; + } else { + /* Fix the left offset for all behind 2a and 2b */ + left_offset_fixed += key.offset - ekey->offset; + } + + /* + * Check if we have the same extent. + */ + if (left_disknr != right_disknr || + left_offset_fixed != right_offset || + left_gen != right_gen) { + ret = 0; + goto out; + } + + /* + * Go to the next extent. + */ + ret = btrfs_next_item(sctx->parent_root, path); + if (ret < 0) + goto out; + if (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + key.offset += right_len; + break; + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; + } + key = found_key; + } + + /* + * We're now behind the left extent (treat as unchanged) or at the end + * of the right side (treat as changed). 
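The core of the comparison loop above is the offset fix-up: when a right-hand extent item starts before the left extent (cases 2a and 7 in the diagram), the right extent's data offset is advanced; when it starts later, the left offset is advanced instead, and the extent only counts as unchanged if disk bytenr, generation and the fixed-up offsets all match. A reduced sketch of that per-item check, with an illustrative struct rather than the on-disk item layout:

#include <stdbool.h>
#include <stdint.h>

struct ex_extent {
    uint64_t file_offset;   /* key.offset of the extent item */
    uint64_t disk_bytenr;
    uint64_t data_offset;   /* offset into the backing extent */
    uint64_t generation;
};

static bool ex_same_backing_range(const struct ex_extent *left,
                                  const struct ex_extent *right)
{
    uint64_t left_fixed = left->data_offset;
    uint64_t right_fixed = right->data_offset;

    if (right->file_offset < left->file_offset)
        right_fixed += left->file_offset - right->file_offset;
    else
        left_fixed += right->file_offset - left->file_offset;

    return left->disk_bytenr == right->disk_bytenr &&
           left->generation == right->generation &&
           left_fixed == right_fixed;
}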
+ */ + if (key.offset >= ekey->offset + left_len) + ret = 1; + else + ret = 0; + + +out: + btrfs_free_path(path); + return ret; +} + +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); +out: + btrfs_free_path(path); + return ret; +} + +static int range_is_hole_in_parent(struct send_ctx *sctx, + const u64 start, + const u64 end) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = sctx->parent_root; + u64 search_start = start; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = search_start; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + while (search_start < end) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u64 extent_end; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < sctx->cur_ino || + key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + if (key.objectid > sctx->cur_ino || + key.type > BTRFS_EXTENT_DATA_KEY || + key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(path); + if (extent_end <= start) + goto next; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { + search_start = extent_end; + goto next; + } + ret = 0; + goto out; +next: + path->slots[0]++; + } + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. 
+ */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) { + ret = range_is_hole_in_parent(sctx, + sctx->cur_inode_last_extent, + key->offset); + if (ret < 0) + return ret; + else if (ret == 0) + ret = send_hole(sctx, key->offset); + else + ret = 0; + } + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); + return ret; +} + +static int process_extent(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct clone_root *found_clone = NULL; + int ret = 0; + + if (S_ISLNK(sctx->cur_inode_mode)) + return 0; + + if (sctx->parent_root && !sctx->cur_inode_new) { + ret = is_extent_unchanged(sctx, path, key); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out_hole; + } + } else { + struct btrfs_file_extent_item *ei; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_PREALLOC || + type == BTRFS_FILE_EXTENT_REG) { + /* + * The send spec does not have a prealloc command yet, + * so just leave a hole for prealloc'ed extents until + * we have enough commands queued up to justify rev'ing + * the send spec. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = 0; + goto out; + } + + /* Have a hole, just skip it. */ + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { + ret = 0; + goto out; + } + } + } + + ret = find_extent_clone(sctx, path, key->objectid, key->offset, + sctx->cur_inode_size, &found_clone); + if (ret != -ENOENT && ret < 0) + goto out; + + ret = send_write_or_clone(sctx, path, key, found_clone); + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); +out: + return ret; +} + +static int process_all_extents(struct send_ctx *sctx) +{ + int ret = 0; + int iter_ret = 0; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + + root = sctx->send_root; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + break; + } + + ret = process_extent(sctx, path, &found_key); + if (ret < 0) + break; + } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; + + btrfs_free_path(path); + return ret; +} + +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) +{ + int ret = 0; + + if (sctx->cur_ino == 0) + goto out; + if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && + sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) + goto out; + if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) + goto out; + + ret = process_recorded_refs(sctx, pending_move); + if (ret < 0) + goto out; + + *refs_processed = 1; +out: + return ret; +} + +static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +{ + int ret = 0; + struct btrfs_inode_info info; + u64 left_mode; + u64 left_uid; + u64 left_gid; + u64 left_fileattr; + u64 right_mode; + u64 right_uid; + u64 right_gid; + u64 right_fileattr; + int need_chmod = 0; + int need_chown = 0; + bool need_fileattr = false; + int need_truncate = 1; + int pending_move = 0; + int refs_processed = 0; + + if (sctx->ignore_cur_inode) + return 0; + + ret = 
process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); + if (ret < 0) + goto out; + + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) + goto out; + if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) + goto out; + ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); + if (ret < 0) + goto out; + left_mode = info.mode; + left_uid = info.uid; + left_gid = info.gid; + left_fileattr = info.fileattr; + + if (!sctx->parent_root || sctx->cur_inode_new) { + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode)) + need_chmod = 1; + if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) + need_truncate = 0; + } else { + u64 old_size; + + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); + if (ret < 0) + goto out; + old_size = info.size; + right_mode = info.mode; + right_uid = info.uid; + right_gid = info.gid; + right_fileattr = info.fileattr; + + if (left_uid != right_uid || left_gid != right_gid) + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) + need_chmod = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) + need_fileattr = true; + if ((old_size == sctx->cur_inode_size) || + (sctx->cur_inode_size > old_size && + sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) + need_truncate = 0; + } + + if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } + if (need_truncate) { + ret = send_truncate(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + sctx->cur_inode_size); + if (ret < 0) + goto out; + } + } + + if (need_chown) { + ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_uid, left_gid); + if (ret < 0) + goto out; + } + if (need_chmod) { + ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_mode); + if (ret < 0) + goto out; + } + if (need_fileattr) { + ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_fileattr); + if (ret < 0) + goto out; + } + + if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) + && sctx->cur_inode_needs_verity) { + ret = process_verity(sctx); + if (ret < 0) + goto out; + } + + ret = send_capabilities(sctx); + if (ret < 0) + goto out; + + /* + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. + */ + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. 
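The attribute fix-up decisions above reduce to a few comparisons between the send-side and parent-side inode items; in particular a truncate is skipped when the size did not change, or when the file grew and the last write already ended exactly at the new i_size (new inodes always get a chown, and a chmod unless they are symlinks; the chmod and fileattr predicates follow the same pattern and are omitted). A condensed sketch of the chown and truncate predicates, using illustrative ex_* names:

#include <stdbool.h>
#include <stdint.h>

struct ex_inode_info {
    uint64_t size, mode, uid, gid;
};

static bool ex_need_chown(const struct ex_inode_info *left,
                          const struct ex_inode_info *right, bool is_new)
{
    return is_new || left->uid != right->uid || left->gid != right->gid;
}

static bool ex_need_truncate(uint64_t old_size, uint64_t new_size,
                             uint64_t next_write_offset, bool is_new)
{
    if (is_new)
        return next_write_offset != new_size;
    if (old_size == new_size)
        return false;
    if (new_size > old_size && next_write_offset == new_size)
        return false;
    return true;
}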
If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + if (ret < 0) + goto out; + } + +out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + + return ret; +} + +static void close_current_inode(struct send_ctx *sctx) +{ + u64 i_size; + + if (sctx->cur_inode == NULL) + return; + + i_size = i_size_read(sctx->cur_inode); + + /* + * If we are doing an incremental send, we may have extents between the + * last processed extent and the i_size that have not been processed + * because they haven't changed but we may have read some of their pages + * through readahead, see the comments at send_extent_data(). + */ + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + round_up(i_size, PAGE_SIZE) - 1); + + iput(sctx->cur_inode); + sctx->cur_inode = NULL; +} + +static int changed_inode(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + struct btrfs_key *key = sctx->cmp_key; + struct btrfs_inode_item *left_ii = NULL; + struct btrfs_inode_item *right_ii = NULL; + u64 left_gen = 0; + u64 right_gen = 0; + + close_current_inode(sctx); + + sctx->cur_ino = key->objectid; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_last_extent = (u64)-1; + sctx->cur_inode_next_write_offset = 0; + sctx->ignore_cur_inode = false; + + /* + * Set send_progress to current inode. This will tell all get_cur_xxx + * functions that the current inode's refs are not updated yet. Later, + * when process_recorded_refs is finished, it is set to cur_ino + 1. + */ + sctx->send_progress = sctx->cur_ino; + + if (result == BTRFS_COMPARE_TREE_NEW || + result == BTRFS_COMPARE_TREE_CHANGED) { + left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], + sctx->left_path->slots[0], + struct btrfs_inode_item); + left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], + left_ii); + } else { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + } + if (result == BTRFS_COMPARE_TREE_CHANGED) { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + + /* + * The cur_ino = root dir case is special here. We can't treat + * the inode as deleted+reused because it would generate a + * stream that tries to delete/mkdir the root dir. + */ + if (left_gen != right_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + sctx->cur_inode_new_gen = true; + } + + /* + * Normally we do not find inodes with a link count of zero (orphans) + * because the most common case is to create a snapshot and use it + * for a send operation. 
However other less common use cases involve + * using a subvolume and send it after turning it to RO mode just + * after deleting all hard links of a file while holding an open + * file descriptor against it or turning a RO snapshot into RW mode, + * keep an open file descriptor against a file, delete it and then + * turn the snapshot back to RO mode before using it for a send + * operation. The former is what the receiver operation does. + * Therefore, if we want to send these snapshots soon after they're + * received, we need to handle orphan inodes as well. Moreover, orphans + * can appear not only in the send snapshot but also in the parent + * snapshot. Here are several cases: + * + * Case 1: BTRFS_COMPARE_TREE_NEW + * | send snapshot | action + * -------------------------------- + * nlink | 0 | ignore + * + * Case 2: BTRFS_COMPARE_TREE_DELETED + * | parent snapshot | action + * ---------------------------------- + * nlink | 0 | as usual + * Note: No unlinks will be sent because there're no paths for it. + * + * Case 3: BTRFS_COMPARE_TREE_CHANGED + * | | parent snapshot | send snapshot | action + * ----------------------------------------------------------------------- + * subcase 1 | nlink | 0 | 0 | ignore + * subcase 2 | nlink | >0 | 0 | new_gen(deletion) + * subcase 3 | nlink | 0 | >0 | new_gen(creation) + * + */ + if (result == BTRFS_COMPARE_TREE_NEW) { + if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { + sctx->ignore_cur_inode = true; + goto out; + } + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); + if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + ret = send_create_inode_if_needed(sctx); + } else if (result == BTRFS_COMPARE_TREE_DELETED) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + u32 new_nlinks, old_nlinks; + + new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); + old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); + if (new_nlinks == 0 && old_nlinks == 0) { + sctx->ignore_cur_inode = true; + goto out; + } else if (new_nlinks == 0 || old_nlinks == 0) { + sctx->cur_inode_new_gen = 1; + } + /* + * We need to do some special handling in case the inode was + * reported as changed with a changed generation number. This + * means that the original inode was deleted and new inode + * reused the same inum. So we have to treat the old inode as + * deleted and the new one as new. + */ + if (sctx->cur_inode_new_gen) { + /* + * First, process the inode as if it was deleted. + */ + if (old_nlinks > 0) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + } + + /* + * Now process the inode as if it was new. 
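The orphan-handling table in the comment above boils down to a three-way decision on the link counts seen in the parent and send snapshots for a CHANGED inode. A tiny sketch of that classification, with illustrative ex_* names:

#include <stdint.h>

enum ex_changed_action {
    EX_IGNORE,     /* subcase 1: orphan in both snapshots */
    EX_NEW_GEN,    /* subcases 2 and 3: treat as delete + (re)create */
    EX_AS_USUAL,   /* handled normally; a generation change can still
                      force the delete + recreate path */
};

static enum ex_changed_action ex_classify(uint32_t old_nlinks,
                                          uint32_t new_nlinks)
{
    if (new_nlinks == 0 && old_nlinks == 0)
        return EX_IGNORE;
    if (new_nlinks == 0 || old_nlinks == 0)
        return EX_NEW_GEN;
    return EX_AS_USUAL;
}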
+ */ + if (new_nlinks > 0) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], + left_ii); + ret = send_create_inode_if_needed(sctx); + if (ret < 0) + goto out; + + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + /* + * Advance send_progress now as we did not get + * into process_recorded_refs_if_needed in the + * new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; + + /* + * Now process all extents and xattrs of the + * inode as if they were all new. + */ + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } + } else { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_deleted = false; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + } + } + +out: + return ret; +} + +/* + * We have to process new refs before deleted refs, but compare_trees gives us + * the new and deleted refs mixed. To fix this, we record the new/deleted refs + * first and later process them in process_recorded_refs. + * For the cur_inode_new_gen case, we skip recording completely because + * changed_inode did already initiate processing of refs. The reason for this is + * that in this case, compare_tree actually compares the refs of 2 different + * inodes. To fix this, process_all_refs is used in changed_inode to handle all + * refs of the right tree as deleted and all refs of the left tree as new. + */ +static int changed_ref(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "reference"); + return -EIO; + } + + if (!sctx->cur_inode_new_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = record_new_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = record_deleted_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = record_changed_ref(sctx); + } + + return ret; +} + +/* + * Process new/deleted/changed xattrs. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of xattrs. The reason is the same as in changed_ref + */ +static int changed_xattr(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "xattr"); + return -EIO; + } + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = process_new_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = process_deleted_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = process_changed_xattr(sctx); + } + + return ret; +} + +/* + * Process new/deleted/changed extents. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of extents. 
The reason is the same as in changed_ref + */ +static int changed_extent(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + /* + * We have found an extent item that changed without the inode item + * having changed. This can happen either after relocation (where the + * disk_bytenr of an extent item is replaced at + * relocation.c:replace_file_extents()) or after deduplication into a + * file in both the parent and send snapshots (where an extent item can + * get modified or replaced with a new one). Note that deduplication + * updates the inode item, but it only changes the iversion (sequence + * field in the inode item) of the inode, so if a file is deduplicated + * the same amount of times in both the parent and send snapshots, its + * iversion becomes the same in both snapshots, whence the inode item is + * the same on both snapshots. + */ + if (sctx->cur_ino != sctx->cmp_key->objectid) + return 0; + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result != BTRFS_COMPARE_TREE_DELETED) + ret = process_extent(sctx, sctx->left_path, + sctx->cmp_key); + } + + return ret; +} + +static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) +{ + int ret = 0; + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + sctx->cur_inode_needs_verity = true; + } + return ret; +} + +static int dir_changed(struct send_ctx *sctx, u64 dir) +{ + u64 orig_gen, new_gen; + int ret; + + ret = get_inode_gen(sctx->send_root, dir, &new_gen); + if (ret) + return ret; + + ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); + if (ret) + return ret; + + return (orig_gen != new_gen) ? 1 : 0; +} + +static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u64 dirid = 0, last_dirid = 0; + unsigned long ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + int ret = 0; + + /* Easy case, just check this one dirid */ + if (key->type == BTRFS_INODE_REF_KEY) { + dirid = key->offset; + + ret = dir_changed(sctx, dirid); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + dirid = btrfs_inode_extref_parent(leaf, extref); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + cur_offset += ref_name_len + sizeof(*extref); + if (dirid == last_dirid) + continue; + ret = dir_changed(sctx, dirid); + if (ret) + break; + last_dirid = dirid; + } +out: + return ret; +} + +/* + * Updates compare related fields in sctx and simply forwards to the actual + * changed_xxx functions. + */ +static int changed_cb(struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + struct send_ctx *sctx) +{ + int ret = 0; + + /* + * We can not hold the commit root semaphore here. 
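compare_refs() above walks the packed payload of an extended ref item, which is a sequence of variable-length records, each a fixed header followed by the name bytes; the parent directory id is pulled from every record and checked with dir_changed(). A simplified sketch of walking such a packed record array; the struct layout here is illustrative, not the on-disk format:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative record: fixed header followed by name_len bytes of name. */
struct ex_extref {
    uint64_t parent_dir;
    uint16_t name_len;
};

/* Calls cb(parent_dir) for each record; stops early if cb returns nonzero. */
static int ex_walk_extrefs(const uint8_t *item, size_t item_size,
                           int (*cb)(uint64_t parent_dir))
{
    size_t cur = 0;

    while (cur + sizeof(struct ex_extref) <= item_size) {
        struct ex_extref ref;
        int ret;

        memcpy(&ref, item + cur, sizeof(ref));
        ret = cb(ref.parent_dir);
        if (ret)
            return ret;
        cur += sizeof(ref) + ref.name_len;
    }
    return 0;
}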
This is because in + * the case of sending and receiving to the same filesystem, using a + * pipe, could result in a deadlock: + * + * 1) The task running send blocks on the pipe because it's full; + * + * 2) The task running receive, which is the only consumer of the pipe, + * is waiting for a transaction commit (for example due to a space + * reservation when doing a write or triggering a transaction commit + * when creating a subvolume); + * + * 3) The transaction is waiting to write lock the commit root semaphore, + * but can not acquire it since it's being held at 1). + * + * Down this call chain we write to the pipe through kernel_write(). + * The same type of problem can also happen when sending to a file that + * is stored in the same filesystem - when reserving space for a write + * into the file, we can trigger a transaction commit. + * + * Our caller has supplied us with clones of leaves from the send and + * parent roots, so we're safe here from a concurrent relocation and + * further reallocation of metadata extents while we are here. Below we + * also assert that the leaves are clones. + */ + lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); + + /* + * We always have a send root, so left_path is never NULL. We will not + * have a leaf when we have reached the end of the send root but have + * not yet reached the end of the parent root. + */ + if (left_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &left_path->nodes[0]->bflags)); + /* + * When doing a full send we don't have a parent root, so right_path is + * NULL. When doing an incremental send, we may have reached the end of + * the parent root already, so we don't have a leaf at right_path. + */ + if (right_path && right_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &right_path->nodes[0]->bflags)); + + if (result == BTRFS_COMPARE_TREE_SAME) { + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { + return 0; + } + result = BTRFS_COMPARE_TREE_CHANGED; + ret = 0; + } + + sctx->left_path = left_path; + sctx->right_path = right_path; + sctx->cmp_key = key; + + ret = finish_inode_if_needed(sctx, 0); + if (ret < 0) + goto out; + + /* Ignore non-FS objects */ + if (key->objectid == BTRFS_FREE_INO_OBJECTID || + key->objectid == BTRFS_FREE_SPACE_OBJECTID) + goto out; + + if (key->type == BTRFS_INODE_ITEM_KEY) { + ret = changed_inode(sctx, result); + } else if (!sctx->ignore_cur_inode) { + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) + ret = changed_ref(sctx, result); + else if (key->type == BTRFS_XATTR_ITEM_KEY) + ret = changed_xattr(sctx, result); + else if (key->type == BTRFS_EXTENT_DATA_KEY) + ret = changed_extent(sctx, result); + else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && + key->offset == 0) + ret = changed_verity(sctx, result); + } + +out: + return ret; +} + +static int search_key_again(const struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + int ret; + + if (!path->need_commit_sem) + lockdep_assert_held_read(&root->fs_info->commit_root_sem); + + /* + * Roots used for send operations are readonly and no one can add, + * update or remove keys from them, so we should be able to find our + * key again. 
The only exception is deduplication, which can operate on + * readonly roots and add, update or remove keys to/from them - but at + * the moment we don't allow it to run in parallel with send. + */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ASSERT(ret <= 0); + if (ret > 0) { + btrfs_print_tree(path->nodes[path->lowest_level], false); + btrfs_err(root->fs_info, +"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", + key->objectid, key->type, key->offset, + (root == sctx->parent_root ? "parent" : "send"), + root->root_key.objectid, path->lowest_level, + path->slots[path->lowest_level]); + return -EUCLEAN; + } + + return ret; +} + +static int full_send_tree(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_key key; + struct btrfs_fs_info *fs_info = send_root->fs_info; + struct btrfs_path *path; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD_ALWAYS; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + down_read(&fs_info->commit_root_sem); + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) + goto out_finish; + + while (1) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + ret = changed_cb(path, NULL, &key, + BTRFS_COMPARE_TREE_NEW, sctx); + if (ret < 0) + goto out; + + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + /* + * A transaction used for relocating a block group was + * committed or is about to finish its commit. Release + * our path (leaf) and restart the search, so that we + * avoid operating on any file extent items that are + * stale, with a disk_bytenr that reflects a pre + * relocation value. This way we avoid as much as + * possible to fallback to regular writes when checking + * if we can clone file ranges. + */ + btrfs_release_path(path); + ret = search_key_again(sctx, send_root, path, &key); + if (ret < 0) + goto out; + } else { + up_read(&fs_info->commit_root_sem); + } + + ret = btrfs_next_item(send_root, path); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + break; + } + } + +out_finish: + ret = finish_inode_if_needed(sctx, 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int replace_node_with_clone(struct btrfs_path *path, int level) +{ + struct extent_buffer *clone; + + clone = btrfs_clone_extent_buffer(path->nodes[level]); + if (!clone) + return -ENOMEM; + + free_extent_buffer(path->nodes[level]); + path->nodes[level] = clone; + + return 0; +} + +static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) +{ + struct extent_buffer *eb; + struct extent_buffer *parent = path->nodes[*level]; + int slot = path->slots[*level]; + const int nritems = btrfs_header_nritems(parent); + u64 reada_max; + u64 reada_done = 0; + + lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + + BUG_ON(*level == 0); + eb = btrfs_read_node_slot(parent, slot); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + /* + * Trigger readahead for the next leaves we will process, so that it is + * very likely that when we need them they are already in memory and we + * will not block on disk IO. 
For nodes we only do readahead for one, + * since the time window between processing nodes is typically larger. + */ + reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize); + + for (slot++; slot < nritems && reada_done < reada_max; slot++) { + if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { + btrfs_readahead_node_child(parent, slot); + reada_done += eb->fs_info->nodesize; + } + } + + path->nodes[*level - 1] = eb; + path->slots[*level - 1] = 0; + (*level)--; + + if (*level == 0) + return replace_node_with_clone(path, 0); + + return 0; +} + +static int tree_move_next_or_upnext(struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) { + path->slots[*level] = nritems - 1; + return -1; + } + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key, + u64 reada_min_gen) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(path, level, root_level); + } else { + ret = tree_move_down(path, level, reada_min_gen); + } + + /* + * Even if we have reached the end of a tree, ret is -1, update the key + * anyway, so that in case we need to restart due to a block group + * relocation, we can assert that the last key of the root node still + * exists in the tree. + */ + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + + return ret; +} + +static int tree_compare_item(struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +/* + * A transaction used for relocating a block group was committed or is about to + * finish its commit. Release our paths and restart the search, so that we are + * not using stale extent buffers: + * + * 1) For levels > 0, we are only holding references of extent buffers, without + * any locks on them, which does not prevent them from having been relocated + * and reallocated after the last time we released the commit root semaphore. + * The exception are the root nodes, for which we always have a clone, see + * the comment at btrfs_compare_trees(); + * + * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so + * we are safe from the concurrent relocation and reallocation. 
However they
+ * can have file extent items with a pre relocation disk_bytenr value, so we
+ * restart the search from the current commit roots and clone the new leaves so
+ * that we get the post relocation disk_bytenr values. Not doing so could
+ * make us clone the wrong data in case there are new extents using the old
+ * disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ const struct btrfs_key *left_key,
+ const struct btrfs_key *right_key,
+ int left_level,
+ int right_level,
+ const struct send_ctx *sctx)
+{
+ int root_level;
+ int ret;
+
+ lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ /*
+ * Since keys can not be added or removed to/from our roots because they
+ * are readonly and we do not allow deduplication to run in parallel
+ * (which can add, remove or change keys), the layout of the trees should
+ * not change.
+ */
+ left_path->lowest_level = left_level;
+ ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+ if (ret < 0)
+ return ret;
+
+ right_path->lowest_level = right_level;
+ ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If the lowest level nodes are leaves, clone them so that they can be
+ * safely used by changed_cb() while not under the protection of the
+ * commit root semaphore, even if relocation and reallocation happens in
+ * parallel.
+ */
+ if (left_level == 0) {
+ ret = replace_node_with_clone(left_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (right_level == 0) {
+ ret = replace_node_with_clone(right_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * Now clone the root nodes (unless they happen to be the leaves we have
+ * already cloned). This is to protect against concurrent snapshotting of
+ * the send and parent roots (see the comment at btrfs_compare_trees()).
+ */
+ root_level = btrfs_header_level(sctx->send_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(left_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ root_level = btrfs_header_level(sctx->parent_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(right_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
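+ *
+ * As a rough illustration, at level 0 each pair of keys ends up reported to
+ * the callback as follows:
+ *
+ *   key only in the left (send) tree      -> BTRFS_COMPARE_TREE_NEW
+ *   key only in the right (parent) tree   -> BTRFS_COMPARE_TREE_DELETED
+ *   same key, item contents differ        -> BTRFS_COMPARE_TREE_CHANGED
+ *   same key, item contents identical     -> BTRFS_COMPARE_TREE_SAME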
+ */ +static int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, struct send_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = left_root->fs_info; + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached = 0; + int right_end_reached = 0; + int advance_left = 0; + int advance_right = 0; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + u64 reada_min_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + down_read(&fs_info->commit_root_sem); + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + /* + * We clone the root node of the send and parent roots to prevent races + * with snapshot creation of these roots. Snapshot creation COWs the + * root node of a tree, so after the transaction is committed the old + * extent can be reallocated while this send operation is still ongoing. + * So we clone them, under the commit root semaphore, to be race free. + */ + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + ret = -ENOMEM; + goto out_unlock; + } + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + ret = -ENOMEM; + goto out_unlock; + } + /* + * Our right root is the parent root, while the left root is the "send" + * root. 
We know that all new nodes/leaves in the left root must have + * a generation greater than the right root's generation, so we trigger + * readahead for those nodes and leaves of the left root, as we know we + * will need to read them at some point. + */ + reada_min_gen = btrfs_header_generation(right_root->commit_root); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + sctx->last_reloc_trans = fs_info->last_reloc_trans; + + while (1) { + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + up_read(&fs_info->commit_root_sem); + cond_resched(); + down_read(&fs_info->commit_root_sem); + } + + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + ret = restart_after_relocation(left_path, right_path, + &left_key, &right_key, + left_level, right_level, + sctx); + if (ret < 0) + goto out_unlock; + sctx->last_reloc_trans = fs_info->last_reloc_trans; + } + + if (advance_left && !left_end_reached) { + ret = tree_advance(left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key, reada_min_gen); + if (ret == -1) + left_end_reached = ADVANCE; + else if (ret < 0) + goto out_unlock; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key, reada_min_gen); + if (ret == -1) + right_end_reached = ADVANCE; + else if (ret < 0) + goto out_unlock; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out_unlock; + } else if (left_end_reached) { + if (right_level == 0) { + up_read(&fs_info->commit_root_sem); + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + sctx); + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + up_read(&fs_info->commit_root_sem); + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + sctx); + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + up_read(&fs_info->commit_root_sem); + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + sctx); + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + sctx); + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result result; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_path, right_path, + tmp_buf); + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_path, right_path, + &left_key, result, sctx); + advance_left = ADVANCE; + advance_right = ADVANCE; + } + + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + 
advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out_unlock: + up_read(&fs_info->commit_root_sem); +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kvfree(tmp_buf); + return ret; +} + +static int send_subvol(struct send_ctx *sctx) +{ + int ret; + + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { + ret = send_header(sctx); + if (ret < 0) + goto out; + } + + ret = send_subvol_begin(sctx); + if (ret < 0) + goto out; + + if (sctx->parent_root) { + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); + if (ret < 0) + goto out; + ret = finish_inode_if_needed(sctx, 1); + if (ret < 0) + goto out; + } else { + ret = full_send_tree(sctx); + if (ret < 0) + goto out; + } + +out: + free_recorded_refs(sctx); + return ret; +} + +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx->parent_root && + sctx->parent_root->node != sctx->parent_root->commit_root) + goto commit_trans; + + for (i = 0; i < sctx->clone_roots_cnt; i++) + if (sctx->clone_roots[i].root->node != + sctx->clone_roots[i].root->commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. */ + if (!trans) { + trans = btrfs_join_transaction(sctx->send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans); +} + +/* + * Make sure any existing dellaloc is flushed for any root used by a send + * operation so that we do not miss any data and we do not race with writeback + * finishing and changing a tree while send is using the tree. This could + * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and + * a send operation then uses the subvolume. 
+ * After flushing delalloc ensure_commit_roots_uptodate() must be called. + */ +static int flush_delalloc_roots(struct send_ctx *sctx) +{ + struct btrfs_root *root = sctx->parent_root; + int ret; + int i; + + if (root) { + ret = btrfs_start_delalloc_snapshot(root, false); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + for (i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + ret = btrfs_start_delalloc_snapshot(root, false); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + return 0; +} + +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progress unbalanced %d root %llu", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + +static void dedupe_in_progress_warn(const struct btrfs_root *root) +{ + btrfs_warn_rl(root->fs_info, +"cannot use root %llu for send while deduplications on it are in progress (%d in progress)", + root->root_key.objectid, root->dedupe_in_progress); +} + +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) +{ + int ret = 0; + struct btrfs_root *send_root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = send_root->fs_info; + struct btrfs_root *clone_root; + struct send_ctx *sctx = NULL; + u32 i; + u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + size_t alloc_size; + int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * The subvolume must remain read-only during send, protect against + * making it RW. This also protects against deletion. + */ + spin_lock(&send_root->root_item_lock); + if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + dedupe_in_progress_warn(send_root); + spin_unlock(&send_root->root_item_lock); + return -EAGAIN; + } + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* + * Userspace tools do the checks and warn the user if it's + * not RO. + */ + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; + } + + /* + * Check that we don't overflow at later allocations, we request + * clone_sources_count + 1 items, and compare to unsigned long inside + * access_ok. Also set an upper limit for allocation size so this can't + * easily exhaust memory. Max number of clone sources is about 200K. + */ + if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { + ret = -EINVAL; + goto out; + } + + if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { + ret = -EINVAL; + goto out; + } + + sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL); + if (!sctx) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&sctx->new_refs); + INIT_LIST_HEAD(&sctx->deleted_refs); + + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). 
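+ * Entries still present here once the stream has been generated are
+ * flushed with send_utimes() near the end of btrfs_ioctl_send().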
+ */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; + + sctx->flags = arg->flags; + + if (arg->flags & BTRFS_SEND_FLAG_VERSION) { + if (arg->version > BTRFS_SEND_STREAM_VERSION) { + ret = -EPROTO; + goto out; + } + /* Zero means "use the highest version" */ + sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; + } else { + sctx->proto = 1; + } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } + + sctx->send_filp = fget(arg->send_fd); + if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out; + } + + sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + + sctx->clone_roots_cnt = arg->clone_sources_count; + + if (sctx->proto >= 2) { + u32 send_buf_num_pages; + + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; + sctx->send_buf_pages = kcalloc(send_buf_num_pages, + sizeof(*sctx->send_buf_pages), + GFP_KERNEL); + if (!sctx->send_buf_pages) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < send_buf_num_pages; i++) { + sctx->send_buf_pages[i] = + vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); + } + } else { + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + } + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } + + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); + + if (arg->clone_sources_count) { + clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(clone_sources_tmp, arg->clone_sources, + alloc_size); + if (ret) { + ret = -EFAULT; + goto out; + } + + for (i = 0; i < arg->clone_sources_count; i++) { + clone_root = btrfs_get_fs_root(fs_info, + clone_sources_tmp[i], true); + if (IS_ERR(clone_root)) { + ret = PTR_ERR(clone_root); + goto out; + } + spin_lock(&clone_root->root_item_lock); + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + btrfs_put_root(clone_root); + ret = -EPERM; + goto out; + } + if (clone_root->dedupe_in_progress) { + dedupe_in_progress_warn(clone_root); + spin_unlock(&clone_root->root_item_lock); + btrfs_put_root(clone_root); + ret = -EAGAIN; + goto out; + } + clone_root->send_in_progress++; + spin_unlock(&clone_root->root_item_lock); + + sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; + } + kvfree(clone_sources_tmp); + clone_sources_tmp = NULL; + } + + if (arg->parent_root) { + sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, + true); + if (IS_ERR(sctx->parent_root)) { + ret = PTR_ERR(sctx->parent_root); + goto out; + } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root) || + 
btrfs_root_dead(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + ret = -EPERM; + goto out; + } + if (sctx->parent_root->dedupe_in_progress) { + dedupe_in_progress_warn(sctx->parent_root); + spin_unlock(&sctx->parent_root->root_item_lock); + ret = -EAGAIN; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); + } + + /* + * Clones from send_root are allowed, but only if the clone source + * is behind the current send position. This is checked while searching + * for possible clone sources. + */ + sctx->clone_roots[sctx->clone_roots_cnt++].root = + btrfs_grab_root(sctx->send_root); + + /* We do a bsearch later */ + sort(sctx->clone_roots, sctx->clone_roots_cnt, + sizeof(*sctx->clone_roots), __clone_root_cmp_sort, + NULL); + sort_clone_roots = 1; + + ret = flush_delalloc_roots(sctx); + if (ret) + goto out; + + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + + ret = send_subvol(sctx); + if (ret < 0) + goto out; + + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) + goto out; + ret = send_cmd(sctx); + if (ret < 0) + goto out; + } + +out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) { + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) { + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { + btrfs_root_dec_send_in_progress(sctx->parent_root); + btrfs_put_root(sctx->parent_root); + } + + kvfree(clone_sources_tmp); + + if (sctx) { + if (sctx->send_filp) + fput(sctx->send_filp); + + kvfree(sctx->clone_roots); + kfree(sctx->send_buf_pages); + kvfree(sctx->send_buf); + kvfree(sctx->verity_descriptor); + + close_current_inode(sctx); + + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + + kfree(sctx); 
+ } + + return ret; +} diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h new file mode 100644 index 0000000000..4f5509cb18 --- /dev/null +++ b/fs/btrfs/send.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * Copyright (C) 2012 STRATO. All rights reserved. + */ + +#ifndef BTRFS_SEND_H +#define BTRFS_SEND_H + +#include + +#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +/* Conditional support for the upcoming protocol version. */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_SEND_STREAM_VERSION 3 +#else +#define BTRFS_SEND_STREAM_VERSION 2 +#endif + +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no + * limit should be assumed, the buffer size is set to be a header with + * compressed extent size. + */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) + +struct inode; +struct btrfs_ioctl_send_args; + +enum btrfs_tlv_type { + BTRFS_TLV_U8, + BTRFS_TLV_U16, + BTRFS_TLV_U32, + BTRFS_TLV_U64, + BTRFS_TLV_BINARY, + BTRFS_TLV_STRING, + BTRFS_TLV_UUID, + BTRFS_TLV_TIMESPEC, +}; + +struct btrfs_stream_header { + char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)]; + __le32 version; +} __attribute__ ((__packed__)); + +struct btrfs_cmd_header { + /* len excluding the header */ + __le32 len; + __le16 cmd; + /* crc including the header with zero crc field */ + __le32 crc; +} __attribute__ ((__packed__)); + +struct btrfs_tlv_header { + __le16 tlv_type; + /* len excluding the header */ + __le16 tlv_len; +} __attribute__ ((__packed__)); + +/* commands */ +enum btrfs_send_cmd { + BTRFS_SEND_C_UNSPEC = 0, + + /* Version 1 */ + BTRFS_SEND_C_SUBVOL = 1, + BTRFS_SEND_C_SNAPSHOT = 2, + + BTRFS_SEND_C_MKFILE = 3, + BTRFS_SEND_C_MKDIR = 4, + BTRFS_SEND_C_MKNOD = 5, + BTRFS_SEND_C_MKFIFO = 6, + BTRFS_SEND_C_MKSOCK = 7, + BTRFS_SEND_C_SYMLINK = 8, + + BTRFS_SEND_C_RENAME = 9, + BTRFS_SEND_C_LINK = 10, + BTRFS_SEND_C_UNLINK = 11, + BTRFS_SEND_C_RMDIR = 12, + + BTRFS_SEND_C_SET_XATTR = 13, + BTRFS_SEND_C_REMOVE_XATTR = 14, + + BTRFS_SEND_C_WRITE = 15, + BTRFS_SEND_C_CLONE = 16, + + BTRFS_SEND_C_TRUNCATE = 17, + BTRFS_SEND_C_CHMOD = 18, + BTRFS_SEND_C_CHOWN = 19, + BTRFS_SEND_C_UTIMES = 20, + + BTRFS_SEND_C_END = 21, + BTRFS_SEND_C_UPDATE_EXTENT = 22, + BTRFS_SEND_C_MAX_V1 = 22, + + /* Version 2 */ + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_FILEATTR = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, + + /* Version 3 */ + BTRFS_SEND_C_ENABLE_VERITY = 26, + BTRFS_SEND_C_MAX_V3 = 26, + /* End */ + BTRFS_SEND_C_MAX = 26, +}; + +/* attributes in send stream */ +enum { + BTRFS_SEND_A_UNSPEC = 0, + + /* Version 1 */ + BTRFS_SEND_A_UUID = 1, + BTRFS_SEND_A_CTRANSID = 2, + + BTRFS_SEND_A_INO = 3, + BTRFS_SEND_A_SIZE = 4, + BTRFS_SEND_A_MODE = 5, + BTRFS_SEND_A_UID = 6, + BTRFS_SEND_A_GID = 7, + BTRFS_SEND_A_RDEV = 8, + BTRFS_SEND_A_CTIME = 9, + BTRFS_SEND_A_MTIME = 10, + BTRFS_SEND_A_ATIME = 11, + BTRFS_SEND_A_OTIME = 12, + + BTRFS_SEND_A_XATTR_NAME = 13, + BTRFS_SEND_A_XATTR_DATA = 14, + + BTRFS_SEND_A_PATH = 15, + BTRFS_SEND_A_PATH_TO = 16, + BTRFS_SEND_A_PATH_LINK = 17, + + BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. 
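+ *
+ * As a rough sketch, a v2 command carrying file data therefore ends with a
+ * bare __le16 type field followed by the payload:
+ *
+ *   [btrfs_cmd_header][other TLVs ...][__le16 = BTRFS_SEND_A_DATA][data bytes]
+ *
+ * with the data running up to the length recorded in the command header.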
+ */ + BTRFS_SEND_A_DATA = 19, + + BTRFS_SEND_A_CLONE_UUID = 20, + BTRFS_SEND_A_CLONE_CTRANSID = 21, + BTRFS_SEND_A_CLONE_PATH = 22, + BTRFS_SEND_A_CLONE_OFFSET = 23, + BTRFS_SEND_A_CLONE_LEN = 24, + + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + /* + * File attributes from the FS_*_FL namespace (i_flags, xflags), + * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored + * in btrfs_inode_item::flags (represented by btrfs_inode::flags and + * btrfs_inode::ro_flags). + */ + BTRFS_SEND_A_FILEATTR = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* Version 3 */ + BTRFS_SEND_A_VERITY_ALGORITHM = 32, + BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33, + BTRFS_SEND_A_VERITY_SALT_DATA = 34, + BTRFS_SEND_A_VERITY_SIG_DATA = 35, + BTRFS_SEND_A_MAX_V3 = 35, + + __BTRFS_SEND_A_MAX = 35, +}; + +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); + +#endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c new file mode 100644 index 0000000000..d7e8cd4f14 --- /dev/null +++ b/fs/btrfs/space-info.c @@ -0,0 +1,1853 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "space-info.h" +#include "sysfs.h" +#include "volumes.h" +#include "free-space-cache.h" +#include "ordered-data.h" +#include "transaction.h" +#include "block-group.h" +#include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" + +/* + * HOW DOES SPACE RESERVATION WORK + * + * If you want to know about delalloc specifically, there is a separate comment + * for that with the delalloc code. This comment is about how the whole system + * works generally. + * + * BASIC CONCEPTS + * + * 1) space_info. This is the ultimate arbiter of how much space we can use. + * There's a description of the bytes_ fields with the struct declaration, + * refer to that for specifics on each field. Suffice it to say that for + * reservations we care about total_bytes - SUM(space_info->bytes_) when + * determining if there is space to make an allocation. There is a space_info + * for METADATA, SYSTEM, and DATA areas. + * + * 2) block_rsv's. These are basically buckets for every different type of + * metadata reservation we have. You can see the comment in the block_rsv + * code on the rules for each type, but generally block_rsv->reserved is how + * much space is accounted for in space_info->bytes_may_use. + * + * 3) btrfs_calc*_size. These are the worst case calculations we used based + * on the number of items we will want to modify. We have one for changing + * items, and one for inserting new items. Generally we use these helpers to + * determine the size of the block reserves, and then use the actual bytes + * values to adjust the space_info counters. + * + * MAKING RESERVATIONS, THE NORMAL CASE + * + * We call into either btrfs_reserve_data_bytes() or + * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with + * num_bytes we want to reserve. 
+ * + * ->reserve + * space_info->bytes_may_reserve += num_bytes + * + * ->extent allocation + * Call btrfs_add_reserved_bytes() which does + * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_reserved += extent_bytes + * + * ->insert reference + * Call btrfs_update_block_group() which does + * space_info->bytes_reserved -= extent_bytes + * space_info->bytes_used += extent_bytes + * + * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) + * + * Assume we are unable to simply make the reservation because we do not have + * enough space + * + * -> __reserve_bytes + * create a reserve_ticket with ->bytes set to our reservation, add it to + * the tail of space_info->tickets, kick async flush thread + * + * ->handle_reserve_ticket + * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set + * on the ticket. + * + * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space + * Flushes various things attempting to free up space. + * + * -> btrfs_try_granting_tickets() + * This is called by anything that either subtracts space from + * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the + * space_info->total_bytes. This loops through the ->priority_tickets and + * then the ->tickets list checking to see if the reservation can be + * completed. If it can the space is added to space_info->bytes_may_use and + * the ticket is woken up. + * + * -> ticket wakeup + * Check if ->bytes == 0, if it does we got our reservation and we can carry + * on, if not return the appropriate error (ENOSPC, but can be EINTR if we + * were interrupted.) + * + * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY + * + * Same as the above, except we add ourselves to the + * space_info->priority_tickets, and we do not use ticket->wait, we simply + * call flush_space() ourselves for the states that are safe for us to call + * without deadlocking and hope for the best. + * + * THE FLUSHING STATES + * + * Generally speaking we will have two cases for each state, a "nice" state + * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to + * reduce the locking over head on the various trees, and even to keep from + * doing any work at all in the case of delayed refs. Each of these delayed + * things however hold reservations, and so letting them run allows us to + * reclaim space so we can make new reservations. + * + * FLUSH_DELAYED_ITEMS + * Every inode has a delayed item to update the inode. Take a simple write + * for example, we would update the inode item at write time to update the + * mtime, and then again at finish_ordered_io() time in order to update the + * isize or bytes. We keep these delayed items to coalesce these operations + * into a single operation done on demand. These are an easy way to reclaim + * metadata space. + * + * FLUSH_DELALLOC + * Look at the delalloc comment to get an idea of how much space is reserved + * for delayed allocation. We can reclaim some of this space simply by + * running delalloc, but usually we need to wait for ordered extents to + * reclaim the bulk of this space. + * + * FLUSH_DELAYED_REFS + * We have a block reserve for the outstanding delayed refs space, and every + * delayed ref operation holds a reservation. Running these is a quick way + * to reclaim space, but we want to hold this until the end because COW can + * churn a lot and we can avoid making some extent tree modifications if we + * are able to delay for as long as possible. 
+ *
+ * ALLOC_CHUNK
+ * We will skip this the first time through space reservation, because of
+ * overcommit and we don't want to have a lot of useless metadata space when
+ * our worst case reservations will likely never come true.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we're freeing inodes we're likely freeing checksums, file extent
+ * items, and extent tree items. Loads of space could be freed up by these
+ * operations, however they won't be usable until the transaction commits.
+ *
+ * COMMIT_TRANS
+ * This will commit the transaction. Historically we had a lot of logic
+ * surrounding whether or not we'd commit the transaction, but this was born
+ * out of a pre-tickets era where we could end up committing the transaction
+ * thousands of times in a row without making progress. Now thanks to our
+ * ticketing system we know if we're not making progress and can error
+ * everybody out after a few commits rather than burning the disk hoping for
+ * a different answer.
+ *
+ * OVERCOMMIT
+ *
+ * Because we hold so many reservations for metadata we will allow you to
+ * reserve more space than is currently free in the currently allocated
+ * metadata space. This only happens with metadata, data does not allow
+ * overcommitting.
+ *
+ * You can see the current logic for when we allow overcommit in
+ * btrfs_can_overcommit(), but it only applies to unallocated space. If there
+ * is no unallocated space to be had, all reservations are kept within the
+ * free space in the allocated metadata chunks.
+ *
+ * Because of overcommitting, you generally want to use the
+ * btrfs_can_overcommit() logic for metadata allocations, as it does the right
+ * thing with or without extra unallocated space.
+ */
+
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+ bool may_use_included)
+{
+ ASSERT(s_info);
+ return s_info->bytes_used + s_info->bytes_reserved +
+ s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
+ (may_use_included ? s_info->bytes_may_use : 0);
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ list_for_each_entry(found, head, list)
+ found->full = 0;
+}
+
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
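+ *
+ * For example, the defaults computed by calc_chunk_size() above for a regular
+ * (non-zoned) filesystem larger than 50GiB are BTRFS_MAX_DATA_CHUNK_SIZE for
+ * DATA, SZ_32M for SYSTEM and SZ_1G for METADATA; smaller filesystems get
+ * SZ_256M for METADATA instead.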
+ */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int i; + int ret; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; + + ret = btrfs_sysfs_add_space_info_type(info, space_info); + if (ret) + return ret; + + list_add(&space_info->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = space_info; + + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return -EINVAL; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } +out: + return ret; +} + +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) +{ + struct btrfs_space_info *found; + int factor, index; + + factor = btrfs_bg_type_to_factor(block_group->flags); + + found = btrfs_find_space_info(info, block_group->flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += block_group->length; + found->disk_total += block_group->length * factor; + found->bytes_used += block_group->used; + found->disk_used += block_group->used * factor; + found->bytes_readonly += block_group->bytes_super; + found->bytes_zone_unusable += block_group->zone_unusable; + if (block_group->length > 0) + found->full = 0; + btrfs_try_granting_tickets(info, found); + spin_unlock(&found->lock); + + block_group->space_info = found; + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + down_write(&found->groups_sem); + list_add_tail(&block_group->list, &found->block_groups[index]); + up_write(&found->groups_sem); +} + +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; + + list_for_each_entry(found, head, list) { + if (found->flags & flags) + return found; + } + return NULL; +} + +static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) +{ + u64 profile; + u64 avail; + int 
factor; + + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + + avail = atomic64_read(&fs_info->free_chunk_space); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually usable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + return avail; +} + +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + u64 avail; + u64 used; + + /* Don't overcommit when in mixed mode */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + used = btrfs_space_info_used(space_info, true); + avail = calc_available_free_space(fs_info, space_info, flush); + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +static void remove_ticket(struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + if (!list_empty(&ticket->list)) { + list_del_init(&ticket->list); + ASSERT(space_info->reclaim_size >= ticket->bytes); + space_info->reclaim_size -= ticket->bytes; + } +} + +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct list_head *head; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + + lockdep_assert_held(&space_info->lock); + + head = &space_info->priority_tickets; +again: + while (!list_empty(head)) { + struct reserve_ticket *ticket; + u64 used = btrfs_space_info_used(space_info, true); + + ticket = list_first_entry(head, struct reserve_ticket, list); + + /* Check and see if our ticket can be satisfied now. 
*/ + if ((used + ticket->bytes <= space_info->total_bytes) || + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + ticket->bytes); + remove_ticket(space_info, ticket); + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + break; + } + } + + if (head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } +} + +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + +static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return "UNKNOWN"; + } +} + +static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); +} + +static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info) +{ + const char *flag_str = space_info_flag_to_str(info); + lockdep_assert_held(&info->lock); + + /* The free space could be negative in case of overcommit */ + btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", + flag_str, + (s64)(info->total_bytes - btrfs_space_info_used(info, true)), + info->full ? "" : "not "); + btrfs_info(fs_info, +"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly, info->bytes_zone_unusable); +} + +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group *cache; + u64 total_avail = 0; + int index = 0; + + spin_lock(&info->lock); + __btrfs_dump_space_info(fs_info, info); + dump_global_block_rsv(fs_info); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + u64 avail; + + spin_lock(&cache->lock); + avail = cache->length - cache->used - cache->pinned - + cache->reserved - cache->delalloc_bytes - + cache->bytes_super - cache->zone_unusable; + btrfs_info(fs_info, +"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", + cache->start, cache->length, cache->used, cache->pinned, + cache->reserved, cache->delalloc_bytes, + cache->bytes_super, cache->zone_unusable, + avail, cache->ro ? 
"[readonly]" : ""); + spin_unlock(&cache->lock); + btrfs_dump_free_space(cache, bytes); + total_avail += avail; + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); + + btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); +} + +static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + u64 bytes; + u64 nr; + + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); + u64 nr; + + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM SZ_256K + +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered, + bool for_preempt) +{ + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 ordered_bytes; + u64 items; + long time_left; + int loops; + + delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) + return; + + /* Calc the number of the pages we need flush for space reservation */ + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly. What we really want to do is reclaim full inode's + * worth of reservations, however that's not available to us + * here. We will take a fraction of the delalloc bytes for our + * flushing loops and hope for the best. Delalloc will expand + * the amount we write to cover an entire dirty extent, which + * will reclaim the metadata reservation for that range. If + * it's not enough subsequent flush stages will be more + * aggressive. + */ + to_reclaim = max(to_reclaim, delalloc_bytes >> 3); + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + } + + trans = current->journal_info; + + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. + */ + if (ordered_bytes > delalloc_bytes && !for_preempt) + wait_ordered = true; + + loops = 0; + while ((delalloc_bytes || ordered_bytes) && loops < 3) { + u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + long nr_pages = min_t(u64, temp, LONG_MAX); + int async_pages; + + btrfs_start_delalloc_roots(fs_info, nr_pages, true); + + /* + * We need to make sure any outstanding async pages are now + * processed before we continue. This is because things like + * sync_inode() try to be smart and skip writing if the inode is + * marked clean. We don't use filemap_fwrite for flushing + * because we want to control how many pages we write out at a + * time, thus this is the only safe way to make sure we've + * waited for outstanding compressed workers to have started + * their jobs and thus have ordered extents set up properly. + * + * This exists because we do not want to wait for each + * individual inode to finish its async work, we simply want to + * start the IO on everybody, and then come back here and wait + * for all of the async work to catch up. 
Once we're done with + * that we know we'll have ordered extents for everything and we + * can decide if we wait for that or not. + * + * If we choose to replace this in the future, make absolutely + * sure that the proper waiting is being done in the async case, + * as there have been bugs in that area before. + */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * We don't want to wait forever, if we wrote less pages in this + * loop than we have outstanding, only wait for that number of + * pages, otherwise we can wait for all async pages to finish + * before continuing. + */ + if (async_pages > nr_pages) + async_pages -= nr_pages; + else + async_pages = 0; + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + async_pages); +skip_async: + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + + /* + * If we are for preemption we just want a one-shot of delalloc + * flushing so we can stop flushing if we decide we don't need + * to anymore. + */ + if (for_preempt) + break; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + ordered_bytes = percpu_counter_sum_positive( + &fs_info->ordered_bytes); + } +} + +/* + * Try to flush some data based on policy set by @state. This is only advisory + * and may fail for various reasons. The caller is supposed to examine the + * state of @space_info to detect the outcome. + */ +static void flush_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 num_bytes, + enum btrfs_flush_state state, bool for_preempt) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction_nostart(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; + break; + } + ret = btrfs_run_delayed_items_nr(trans, nr); + btrfs_end_transaction(trans); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + case FLUSH_DELALLOC_FULL: + if (state == FLUSH_DELALLOC_FULL) + num_bytes = U64_MAX; + shrink_delalloc(fs_info, space_info, num_bytes, + state != FLUSH_DELALLOC, for_preempt); + break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction_nostart(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_delayed_refs_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; + case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_chunk_alloc(trans, + btrfs_get_alloc_profile(fs_info, space_info->flags), + (state == ALLOC_CHUNK) ? 
CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + + if (ret > 0 || ret == -ENOSPC) + ret = 0; + break; + case RUN_DELAYED_IPUTS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + btrfs_run_delayed_iputs(fs_info); + btrfs_wait_on_delayed_iputs(fs_info); + break; + case COMMIT_TRANS: + ASSERT(current->journal_info == NULL); + /* + * We don't want to start a new transaction, just attach to the + * current one or wait it fully commits in case its commit is + * happening at the moment. Note: we don't use a nostart join + * because that does not wait for a transaction to fully commit + * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; + break; + } + ret = btrfs_commit_transaction(trans); + break; + default: + ret = -ENOSPC; + break; + } + + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret, for_preempt); + return; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 avail; + u64 to_reclaim = space_info->reclaim_size; + + lockdep_assert_held(&space_info->lock); + + avail = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + used = btrfs_space_info_used(space_info, true); + + /* + * We may be flushing because suddenly we have less space than we had + * before, and now we're well over-committed based on our current free + * space. If that's the case add in our overage so we make sure to put + * appropriate pressure on the flushing state machine. + */ + if (space_info->total_bytes + avail < used) + to_reclaim += used - (space_info->total_bytes + avail); + + return to_reclaim; +} + +static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 global_rsv_size = fs_info->global_block_rsv.reserved; + u64 ordered, delalloc; + u64 thresh; + u64 used; + + thresh = mult_perc(space_info->total_bytes, 90); + + lockdep_assert_held(&space_info->lock); + + /* If we're just plain full then async reclaim just slows us down. */ + if ((space_info->bytes_used + space_info->bytes_reserved + + global_rsv_size) >= thresh) + return false; + + used = space_info->bytes_may_use + space_info->bytes_pinned; + + /* The total flushable belongs to the global rsv, don't flush. */ + if (global_rsv_size >= used) + return false; + + /* + * 128MiB is 1/4 of the maximum global rsv size. If we have less than + * that devoted to other reservations then there's no sense in flushing, + * we don't have a lot of things that need flushing. + */ + if (used - global_rsv_size <= SZ_128M) + return false; + + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + + /* + * If we have over half of the free space occupied by reservations or + * pinned then we want to start flushing. + * + * We do not do the traditional thing here, which is to say + * + * if (used >= ((total_bytes + avail) / 2)) + * return 1; + * + * because this doesn't quite work how we want. If we had more than 50% + * of the space_info used by bytes_used and we had 0 available we'd just + * constantly run the background flusher. 
Instead we want it to kick in + * if our reclaimable space exceeds our clamped free space. + * + * Our clamping range is 2^1 -> 2^8. Practically speaking that means + * the following: + * + * Amount of RAM Minimum threshold Maximum threshold + * + * 256GiB 1GiB 128GiB + * 128GiB 512MiB 64GiB + * 64GiB 256MiB 32GiB + * 32GiB 128MiB 16GiB + * 16GiB 64MiB 8GiB + * + * These are the range our thresholds will fall in, corresponding to how + * much delalloc we need for the background flusher to kick in. + */ + + thresh = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_readonly + global_rsv_size; + if (used < space_info->total_bytes) + thresh += space_info->total_bytes - used; + thresh >>= space_info->clamp; + + used = space_info->bytes_pinned; + + /* + * If we have more ordered bytes than delalloc bytes then we're either + * doing a lot of DIO, or we simply don't have a lot of delalloc waiting + * around. Preemptive flushing is only useful in that it can free up + * space before tickets need to wait for things to finish. In the case + * of ordered extents, preemptively waiting on ordered extents gets us + * nothing, if our reservations are tied up in ordered extents we'll + * simply have to slow down writers by forcing them to wait on ordered + * extents. + * + * In the case that ordered is larger than delalloc, only include the + * block reserves that we would actually be able to directly reclaim + * from. In this case if we're heavy on metadata operations this will + * clearly be heavy enough to warrant preemptive flushing. In the case + * of heavy DIO or ordered reservations, preemptive flushing will just + * waste time and cause us to slow down. + * + * We want to make sure we truly are maxed out on ordered however, so + * cut ordered in half, and if it's still higher than delalloc then we + * can keep flushing. This is to avoid the case where we start + * flushing, and now delalloc == ordered and we stop preemptively + * flushing when we could still have several gigs of delalloc to flush. 
+ */ + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; + delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); + if (ordered >= delalloc) + used += fs_info->delayed_refs_rsv.reserved + + fs_info->delayed_block_rsv.reserved; + else + used += space_info->bytes_may_use - global_rsv_size; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (!ticket->steal) + return false; + + if (global_rsv->space_info != space_info) + return false; + + spin_lock(&global_rsv->lock); + min_bytes = mult_perc(global_rsv->size, 10); + if (global_rsv->reserved < min_bytes + ticket->bytes) { + spin_unlock(&global_rsv->lock); + return false; + } + global_rsv->reserved -= ticket->bytes; + remove_ticket(space_info, ticket); + ticket->bytes = 0; + wake_up(&ticket->wait); + space_info->tickets_id++; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + return true; +} + +/* + * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * @fs_info - fs_info for this fs + * @space_info - the space info we were flushing + * + * We call this when we've exhausted our flushing ability and haven't made + * progress in satisfying tickets. The reservation code handles tickets in + * order, so if there is a large ticket first and then smaller ones we could + * very well satisfy the smaller tickets. This will attempt to wake up any + * tickets in the list to catch this case. + * + * This function returns true if it was able to make progress by clearing out + * other tickets, or if it stumbles across a ticket that was smaller than the + * first ticket. + */ +static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct reserve_ticket *ticket; + u64 tickets_id = space_info->tickets_id; + const bool aborted = BTRFS_FS_ERROR(fs_info); + + trace_btrfs_fail_all_tickets(fs_info, space_info); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); + __btrfs_dump_space_info(fs_info, space_info); + } + + while (!list_empty(&space_info->tickets) && + tickets_id == space_info->tickets_id) { + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + + if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) + return true; + + if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_info(fs_info, "failing ticket with %llu bytes", + ticket->bytes); + + remove_ticket(space_info, ticket); + if (aborted) + ticket->error = -EIO; + else + ticket->error = -ENOSPC; + wake_up(&ticket->wait); + + /* + * We're just throwing tickets away, so more flushing may not + * trip over btrfs_try_granting_tickets, so we need to call it + * here to see if we can make progress with the next ticket in + * the list. + */ + if (!aborted) + btrfs_try_granting_tickets(fs_info, space_info); + } + return (tickets_id != space_info->tickets_id); +} + +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. 
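As a quick sanity check of the clamp table in the comment above, this standalone snippet reproduces the minimum and maximum thresholds for the listed sizes by shifting with clamp values 8 and 1 (pure arithmetic, no btrfs structures involved):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Sizes from the table above, in bytes. */
	const uint64_t sizes[] = {
		256ULL << 30, 128ULL << 30, 64ULL << 30, 32ULL << 30, 16ULL << 30,
	};

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		/* clamp ranges from 1 to 8, i.e. thresh is divided by 2^clamp */
		uint64_t max_thresh = sizes[i] >> 1;	/* clamp == 1 */
		uint64_t min_thresh = sizes[i] >> 8;	/* clamp == 8 */

		printf("%4llu GiB: min %llu MiB, max %llu GiB\n",
		       (unsigned long long)(sizes[i] >> 30),
		       (unsigned long long)(min_thresh >> 20),
		       (unsigned long long)(max_thresh >> 30));
	}
	return 0;
}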
+ */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + enum btrfs_flush_state flush_state; + int commit_cycles = 0; + u64 last_tickets_id; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info, space_info, to_reclaim, flush_state, false); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info); + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + /* + * We do not want to empty the system of delalloc unless we're + * under heavy pressure, so allow one trip through the flushing + * logic before we start doing a FLUSH_DELALLOC_FULL. + */ + if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) + flush_state++; + + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create a + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next go + * around then we can force a chunk allocation. + */ + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + if (maybe_fail_all_tickets(fs_info, space_info)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +/* + * This handles pre-flushing of metadata space before we get to the point that + * we need to start blocking threads on tickets. The logic here is different + * from the other flush paths because it doesn't rely on tickets to tell us how + * much we need to flush, instead it attempts to keep us below the 80% full + * watermark of space by flushing whichever reservation pool is currently the + * largest. 
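The worker above is effectively an escalating state machine: move to a more aggressive flush state only when a pass made no progress, and drop back to the mildest state whenever a ticket was satisfied. A compact, self-contained sketch of just that control flow, with a fake progress source standing in for the ticket list (all demo_* names are made up):

#include <stdbool.h>
#include <stdio.h>

enum demo_flush_state { DEMO_MILD = 1, DEMO_MEDIUM, DEMO_AGGRESSIVE, DEMO_COMMIT };

/* Pretend each state frees a little space; "progress" is simulated. */
static bool demo_flush(enum demo_flush_state state, int *space_needed)
{
	*space_needed -= (int)state;		/* harder states reclaim more */
	return *space_needed % 3 == 0;		/* fake "a ticket was satisfied" */
}

int main(void)
{
	enum demo_flush_state state = DEMO_MILD;
	int space_needed = 20;

	do {
		bool made_progress = demo_flush(state, &space_needed);

		if (space_needed <= 0)
			break;			/* all tickets satisfied */

		if (made_progress)
			state = DEMO_MILD;	/* restart from the mildest state */
		else
			state++;		/* no progress, escalate */
	} while (state <= DEMO_COMMIT);

	printf("remaining need: %d\n", space_needed);
	return 0;
}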
+ */ +static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv; + struct btrfs_block_rsv *trans_rsv; + int loops = 0; + + fs_info = container_of(work, struct btrfs_fs_info, + preempt_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + delayed_block_rsv = &fs_info->delayed_block_rsv; + delayed_refs_rsv = &fs_info->delayed_refs_rsv; + global_rsv = &fs_info->global_block_rsv; + trans_rsv = &fs_info->trans_block_rsv; + + spin_lock(&space_info->lock); + while (need_preemptive_reclaim(fs_info, space_info)) { + enum btrfs_flush_state flush; + u64 delalloc_size = 0; + u64 to_reclaim, block_rsv_size; + u64 global_rsv_size = global_rsv->reserved; + + loops++; + + /* + * We don't have a precise counter for the metadata being + * reserved for delalloc, so we'll approximate it by subtracting + * out the block rsv's space from the bytes_may_use. If that + * amount is higher than the individual reserves, then we can + * assume it's tied up in delalloc reservations. + */ + block_rsv_size = global_rsv_size + + delayed_block_rsv->reserved + + delayed_refs_rsv->reserved + + trans_rsv->reserved; + if (block_rsv_size < space_info->bytes_may_use) + delalloc_size = space_info->bytes_may_use - block_rsv_size; + + /* + * We don't want to include the global_rsv in our calculation, + * because that's space we can't touch. Subtract it from the + * block_rsv_size for the next checks. + */ + block_rsv_size -= global_rsv_size; + + /* + * We really want to avoid flushing delalloc too much, as it + * could result in poor allocation patterns, so only flush it if + * it's larger than the rest of the pools combined. + */ + if (delalloc_size > block_rsv_size) { + to_reclaim = delalloc_size; + flush = FLUSH_DELALLOC; + } else if (space_info->bytes_pinned > + (delayed_block_rsv->reserved + + delayed_refs_rsv->reserved)) { + to_reclaim = space_info->bytes_pinned; + flush = COMMIT_TRANS; + } else if (delayed_block_rsv->reserved > + delayed_refs_rsv->reserved) { + to_reclaim = delayed_block_rsv->reserved; + flush = FLUSH_DELAYED_ITEMS_NR; + } else { + to_reclaim = delayed_refs_rsv->reserved; + flush = FLUSH_DELAYED_REFS_NR; + } + + spin_unlock(&space_info->lock); + + /* + * We don't want to reclaim everything, just a portion, so scale + * down the to_reclaim by 1/4. If it takes us down to 0, + * reclaim 1 items worth. + */ + to_reclaim >>= 2; + if (!to_reclaim) + to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); + flush_space(fs_info, space_info, to_reclaim, flush, true); + cond_resched(); + spin_lock(&space_info->lock); + } + + /* We only went through once, back off our clamping. */ + if (loops == 1 && !space_info->reclaim_size) + space_info->clamp = max(1, space_info->clamp - 1); + trace_btrfs_done_preemptive_reclaim(fs_info, space_info); + spin_unlock(&space_info->lock); +} + +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. 
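The preemptive worker above repeatedly picks whichever reservation pool looks largest and reclaims only about a quarter of it per pass. A rough standalone sketch of that selection and scaling, with made-up pool names and sizes (the real code compares against block reserve sums, so treat this as an approximation):

#include <stdint.h>
#include <stdio.h>

struct demo_pools {
	uint64_t delalloc;
	uint64_t pinned;
	uint64_t delayed_items;
	uint64_t delayed_refs;
};

static const char *demo_pick(const struct demo_pools *p, uint64_t *to_reclaim)
{
	const char *which;

	if (p->delalloc > p->pinned + p->delayed_items + p->delayed_refs) {
		which = "delalloc";
		*to_reclaim = p->delalloc;
	} else if (p->pinned > p->delayed_items + p->delayed_refs) {
		which = "commit (pinned)";
		*to_reclaim = p->pinned;
	} else if (p->delayed_items > p->delayed_refs) {
		which = "delayed items";
		*to_reclaim = p->delayed_items;
	} else {
		which = "delayed refs";
		*to_reclaim = p->delayed_refs;
	}

	/* Only reclaim a portion per pass, roughly a quarter. */
	*to_reclaim >>= 2;
	if (*to_reclaim == 0)
		*to_reclaim = 1;	/* stand-in for "one item's worth" */
	return which;
}

int main(void)
{
	struct demo_pools p = { .delalloc = 4096, .pinned = 65536,
				.delayed_items = 8192, .delayed_refs = 1024 };
	uint64_t to_reclaim;
	const char *which = demo_pick(&p, &to_reclaim);

	printf("flush %s, to_reclaim=%llu\n", which, (unsigned long long)to_reclaim);
	return 0;
}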
Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by running the + * iputs + * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. + */ +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_FULL, + RUN_DELAYED_IPUTS, + COMMIT_TRANS, + ALLOC_CHUNK_FORCE, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + enum btrfs_flush_state flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state], false); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + + /* Something happened, fail everything and bail. 
*/ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; + + } + spin_unlock(&space_info->lock); + } + return; + +aborted_fs: + maybe_fail_all_tickets(fs_info, space_info); + space_info->flush = 0; + spin_unlock(&space_info->lock); +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) +{ + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); + INIT_WORK(&fs_info->preempt_reclaim_work, + btrfs_preempt_reclaim_metadata_space); +} + +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + +static const enum btrfs_flush_state evict_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + FLUSH_DELAYED_REFS_NR, + FLUSH_DELAYED_REFS, + FLUSH_DELALLOC, + FLUSH_DELALLOC_WAIT, + FLUSH_DELALLOC_FULL, + ALLOC_CHUNK, + COMMIT_TRANS, +}; + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) +{ + u64 to_reclaim; + int flush_state = 0; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); + /* + * This is the priority reclaim path, so to_reclaim could be >0 still + * because we may have only satisfied the priority tickets and still + * left non priority tickets on the list. We would then have + * to_reclaim but ->bytes == 0. + */ + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + + while (flush_state < states_nr) { + spin_unlock(&space_info->lock); + flush_space(fs_info, space_info, to_reclaim, states[flush_state], + false); + flush_state++; + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + } + + /* + * Attempt to steal from the global rsv if we can, except if the fs was + * turned into error mode due to a transaction abort when flushing space + * above, in that case fail with the abort error instead of returning + * success to the caller if we can steal from the global rsv - this is + * just to have caller fail immeditelly instead of later when trying to + * modify the fs, making it easier to debug -ENOSPC problems. + */ + if (BTRFS_FS_ERROR(fs_info)) { + ticket->error = BTRFS_FS_ERROR(fs_info); + remove_ticket(space_info, ticket); + } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + } + + /* + * We must run try_granting_tickets here because we could be a large + * ticket in front of a smaller ticket that can now be satisfied with + * the available space. + */ + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +} + +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + spin_lock(&space_info->lock); + + /* We could have been granted before we got here. 
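Both reclaim workers above detect progress by snapshotting space_info->tickets_id and checking whether it moved, rather than inspecting individual tickets. A tiny illustration of the idiom with a hypothetical counter:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_space_info {
	uint64_t tickets_id;	/* bumped every time a queued ticket is satisfied */
};

static void demo_grant_ticket(struct demo_space_info *si)
{
	si->tickets_id++;
}

int main(void)
{
	struct demo_space_info si = { .tickets_id = 0 };
	uint64_t last = si.tickets_id;
	bool made_progress;

	demo_grant_ticket(&si);		/* a flush pass satisfied one ticket */

	made_progress = (last != si.tickets_id);
	printf("made progress: %s\n", made_progress ? "yes" : "no");
	return 0;
}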
*/ + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + + while (!space_info->full) { + spin_unlock(&space_info->lock); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + } + + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +} + +static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) + +{ + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + /* + * Delete us from the list. After we unlock the space + * info, we don't want the async reclaim job to reserve + * space for this ticket. If that would happen, then the + * ticket's task would not known that space was reserved + * despite getting an error, resulting in a space leak + * (bytes_may_use counter of our space_info). + */ + remove_ticket(space_info, ticket); + ticket->error = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + spin_unlock(&space_info->lock); +} + +/* + * Do the appropriate flushing and waiting for a ticket. + * + * @fs_info: the filesystem + * @space_info: space info for the reservation + * @ticket: ticket for the reservation + * @start_ns: timestamp when the reservation started + * @orig_bytes: amount of bytes originally reserved + * @flush: how much we can flush + * + * This does the work of figuring out how to flush for the ticket, waiting for + * the reservation, and returning the appropriate error if there is one. + */ +static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + u64 start_ns, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: + case BTRFS_RESERVE_FLUSH_ALL: + case BTRFS_RESERVE_FLUSH_ALL_STEAL: + wait_reserve_ticket(fs_info, space_info, ticket); + break; + case BTRFS_RESERVE_FLUSH_LIMIT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_flush_states, + ARRAY_SIZE(priority_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_EVICT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + evict_flush_states, + ARRAY_SIZE(evict_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket); + break; + default: + ASSERT(0); + break; + } + + ret = ticket->error; + ASSERT(list_empty(&ticket->list)); + /* + * Check that we can't have an error set if the reservation succeeded, + * as that would confuse tasks and lead them to error out without + * releasing reserved space (if an error happens the expectation is that + * space wasn't reserved at all). + */ + ASSERT(!(ticket->bytes == 0 && ticket->error)); + trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, + start_ns, flush, ticket->error); + return ret; +} + +/* + * This returns true if this flush state will go through the ordinary flushing + * code. 
+ */ +static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL) || + (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); +} + +static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + + /* + * If we're heavy on ordered operations then clamping won't help us. We + * need to clamp specifically to keep up with dirty'ing buffered + * writers, because there's not a 1:1 correlation of writing delalloc + * and freeing space, like there is with flushing delayed refs or + * delayed nodes. If we're already more ordered than delalloc then + * we're keeping up, otherwise we aren't and should probably clamp. + */ + if (ordered < delalloc) + space_info->clamp = min(space_info->clamp + 1, 8); +} + +static inline bool can_steal(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_EVICT); +} + +/* + * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to + * fail as quickly as possible. + */ +static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) +{ + return (flush != BTRFS_RESERVE_NO_FLUSH && + flush != BTRFS_RESERVE_FLUSH_EMERGENCY); +} + +/* + * Try to reserve bytes from the block_rsv's space. + * + * @fs_info: the filesystem + * @space_info: space info we want to allocate from + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct work_struct *async_work; + struct reserve_ticket ticket; + u64 start_ns = 0; + u64 used; + int ret = -ENOSPC; + bool pending_tickets; + + ASSERT(orig_bytes); + /* + * If have a transaction handle (current->journal_info != NULL), then + * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor + * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those + * flushing methods can trigger transaction commits. + */ + if (current->journal_info) { + /* One assert per line for easier debugging. */ + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); + ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); + } + + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + + spin_lock(&space_info->lock); + used = btrfs_space_info_used(space_info, true); + + /* + * We don't want NO_FLUSH allocations to jump everybody, they can + * generally handle ENOSPC in a different way, so treat them the same as + * normal flushers when it comes to skipping pending tickets. 
+ */ + if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH)) + pending_tickets = !list_empty(&space_info->tickets) || + !list_empty(&space_info->priority_tickets); + else + pending_tickets = !list_empty(&space_info->priority_tickets); + + /* + * Carry on if we have enough space (short-circuit) OR call + * can_overcommit() to ensure we can overcommit to continue. + */ + if (!pending_tickets && + ((used + orig_bytes <= space_info->total_bytes) || + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + + /* + * Things are dire, we need to make a reservation so we don't abort. We + * will let this reservation go through as long as we have actual space + * left to allocate for the block. + */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); + if (used + orig_bytes <= space_info->total_bytes) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + } + + /* + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. + * + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. + */ + if (ret && can_ticket(flush)) { + ticket.bytes = orig_bytes; + ticket.error = 0; + space_info->reclaim_size += ticket.bytes; + init_waitqueue_head(&ticket.wait); + ticket.steal = can_steal(flush); + if (trace_btrfs_reserve_ticket_enabled()) + start_ns = ktime_get_ns(); + + if (flush == BTRFS_RESERVE_FLUSH_ALL || + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + /* + * We were forced to add a reserve ticket, so + * our preemptive flushing is unable to keep + * up. Clamp down on the threshold for the + * preemptive flushing in order to keep up with + * the workload. + */ + maybe_clamp_preempt(fs_info, space_info); + + space_info->flush = 1; + trace_btrfs_trigger_flush(fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, async_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && + !work_busy(&fs_info->preempt_reclaim_work) && + need_preemptive_reclaim(fs_info, space_info)) { + trace_btrfs_trigger_flush(fs_info, space_info->flags, + orig_bytes, flush, "preempt"); + queue_work(system_unbound_wq, + &fs_info->preempt_reclaim_work); + } + } + spin_unlock(&space_info->lock); + if (!ret || !can_ticket(flush)) + return ret; + + return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, + orig_bytes, flush); +} + +/* + * Try to reserve metadata bytes from the block_rsv's space. + * + * @fs_info: the filesystem + * @block_rsv: block_rsv we're allocating for + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. 
It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + block_rsv->space_info->flags, + orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } + return ret; +} + +/* + * Try to reserve data bytes for an allocation. + * + * @fs_info: the filesystem + * @bytes: number of bytes we need + * @flush: how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. + */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + int ret; + + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || + flush == BTRFS_RESERVE_NO_FLUSH); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } + return ret; +} + +/* Dump all the space infos when we abort a transaction due to ENOSPC. */ +__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + btrfs_info(fs_info, "dumping space info:"); + list_for_each_entry(space_info, &fs_info->space_info, list) { + spin_lock(&space_info->lock); + __btrfs_dump_space_info(fs_info, space_info); + spin_unlock(&space_info->lock); + } + dump_global_block_rsv(fs_info); +} + +/* + * Account the unused space of all the readonly block group in the space_info. + * takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->length - + block_group->used) * factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h new file mode 100644 index 0000000000..0bb9d14e60 --- /dev/null +++ b/fs/btrfs/space-info.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SPACE_INFO_H +#define BTRFS_SPACE_INFO_H + +#include "volumes.h" + +/* + * Different levels for to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. 
+ */ +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + + /* + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + * - Committing transaction + */ + BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, + BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_ALL_STEAL, + + /* + * This is for btrfs_use_block_rsv only. We have exhausted our block + * rsv and our global block rsv. This can happen for things like + * delalloc where we are overwriting a lot of extents with a single + * extent and didn't reserve enough space. Alternatively it can happen + * with delalloc where we reserve 1 extents worth for a large extent but + * fragmentation leads to multiple extents being created. This will + * give us the reservation in the case of + * + * if (num_bytes < (space_info->total_bytes - + * btrfs_space_info_used(space_info, false)) + * + * Which ignores bytes_may_use. This is potentially dangerous, but our + * reservation system is generally pessimistic so is able to absorb this + * style of mistake. + */ + BTRFS_RESERVE_FLUSH_EMERGENCY, +}; + +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, +}; + +struct btrfs_space_info { + spinlock_t lock; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ + + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + /* Chunk size in bytes */ + u64 chunk_size; + + /* + * Once a block group drops below this threshold (percents) we'll + * schedule it for reclaim. + */ + int bg_reclaim_threshold; + + int clamp; /* Used to scale our threshold for preemptive + flushing. The value is >> clamp, so turns + out to be a 2^clamp divisor. 
*/ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + u64 flags; + + struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; + + /* + * Size of space that needs to be reclaimed in order to satisfy pending + * tickets + */ + u64 reclaim_size; + + /* + * tickets_id just indicates the next ticket will be handled, so note + * it's not stored per ticket. + */ + u64 tickets_id; + + struct rw_semaphore groups_sem; + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +struct reserve_ticket { + u64 bytes; + int error; + bool steal; + struct list_head list; + wait_queue_head_t wait; +}; + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +/* + * + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ +static inline void \ +btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ + struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ + lockdep_assert_held(&sinfo->lock); \ + trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + trace_btrfs_space_reservation(fs_info, trace_name, \ + sinfo->flags, abs_bytes, \ + bytes > 0); \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group); +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags); +u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); + +static inline void btrfs_space_info_free_bytes_may_use( + struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + spin_lock(&space_info->lock); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + btrfs_try_granting_tickets(fs_info, 
space_info); + spin_unlock(&space_info->lock); +} +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); +void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); + +#endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c new file mode 100644 index 0000000000..1b999c6e41 --- /dev/null +++ b/fs/btrfs/subpage.c @@ -0,0 +1,754 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "messages.h" +#include "ctree.h" +#include "subpage.h" +#include "btrfs_inode.h" + +/* + * Subpage (sectorsize < PAGE_SIZE) support overview: + * + * Limitations: + * + * - Only support 64K page size for now + * This is to make metadata handling easier, as 64K page would ensure + * all nodesize would fit inside one page, thus we don't need to handle + * cases where a tree block crosses several pages. + * + * - Only metadata read-write for now + * The data read-write part is in development. + * + * - Metadata can't cross 64K page boundary + * btrfs-progs and kernel have done that for a while, thus only ancient + * filesystems could have such problem. For such case, do a graceful + * rejection. + * + * Special behavior: + * + * - Metadata + * Metadata read is fully supported. + * Meaning when reading one tree block will only trigger the read for the + * needed range, other unrelated range in the same page will not be touched. + * + * Metadata write support is partial. + * The writeback is still for the full page, but we will only submit + * the dirty extent buffers in the page. + * + * This means, if we have a metadata page like this: + * + * Page offset + * 0 16K 32K 48K 64K + * |/////////| |///////////| + * \- Tree block A \- Tree block B + * + * Even if we just want to writeback tree block A, we will also writeback + * tree block B if it's also dirty. + * + * This may cause extra metadata writeback which results more COW. + * + * Implementation: + * + * - Common + * Both metadata and data will use a new structure, btrfs_subpage, to + * record the status of each sector inside a page. This provides the extra + * granularity needed. + * + * - Metadata + * Since we have multiple tree blocks inside one page, we can't rely on page + * locking anymore, or we will have greatly reduced concurrency or even + * deadlocks (hold one tree lock while trying to lock another tree lock in + * the same page). + * + * Thus for metadata locking, subpage support relies on io_tree locking only. + * This means a slightly higher tree locking latency. + */ + +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +{ + if (fs_info->sectorsize >= PAGE_SIZE) + return false; + + /* + * Only data pages (either through DIO or compression) can have no + * mapping. And if page->mapping->host is data inode, it's subpage. + * As we have ruled our sectorsize >= PAGE_SIZE case already. + */ + if (!page->mapping || !page->mapping->host || + is_data_inode(page->mapping->host)) + return true; + + /* + * Now the only remaining case is metadata, which we only go subpage + * routine if nodesize < PAGE_SIZE. 
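Looking back at space-info.h for a moment: the DECLARE_SPACE_INFO_UPDATE() macro there generates signed updaters that warn and clamp instead of underflowing the counter. The same guard pattern, reduced to a standalone example with a single hypothetical counter:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct demo_counters {
	uint64_t bytes_may_use;
};

static void demo_update_bytes_may_use(struct demo_counters *c, int64_t bytes)
{
	/* Refuse to underflow: warn and clamp to zero, as the macro above does. */
	if (bytes < 0 && c->bytes_may_use < (uint64_t)(-bytes)) {
		fprintf(stderr, "warning: underflow, clamping to 0\n");
		c->bytes_may_use = 0;
		return;
	}
	c->bytes_may_use += bytes;
}

int main(void)
{
	struct demo_counters c = { .bytes_may_use = 4096 };

	demo_update_bytes_may_use(&c, -1024);	/* normal release */
	demo_update_bytes_may_use(&c, -8192);	/* buggy over-release, gets clamped */
	printf("bytes_may_use=%" PRIu64 "\n", c.bytes_may_use);
	return 0;
}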
+ */ + if (fs_info->nodesize < PAGE_SIZE) + return true; + return false; +} + +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) +{ + unsigned int cur = 0; + unsigned int nr_bits; + + ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); + + nr_bits = PAGE_SIZE / sectorsize; + subpage_info->bitmap_nr_bits = nr_bits; + + subpage_info->uptodate_offset = cur; + cur += nr_bits; + + subpage_info->dirty_offset = cur; + cur += nr_bits; + + subpage_info->writeback_offset = cur; + cur += nr_bits; + + subpage_info->ordered_offset = cur; + cur += nr_bits; + + subpage_info->checked_offset = cur; + cur += nr_bits; + + subpage_info->total_nr_bits = cur; +} + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type) +{ + struct btrfs_subpage *subpage; + + /* + * We have cases like a dummy extent buffer page, which is not mapped + * and doesn't need to be locked. + */ + if (page->mapping) + ASSERT(PageLocked(page)); + + /* Either not subpage, or the page already has private attached */ + if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) + return 0; + + subpage = btrfs_alloc_subpage(fs_info, type); + if (IS_ERR(subpage)) + return PTR_ERR(subpage); + + attach_page_private(page, subpage); + return 0; +} + +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + /* Either not subpage, or already detached */ + if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) + return; + + subpage = detach_page_private(page); + ASSERT(subpage); + btrfs_free_subpage(subpage); +} + +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type) +{ + struct btrfs_subpage *ret; + unsigned int real_size; + + ASSERT(fs_info->sectorsize < PAGE_SIZE); + + real_size = struct_size(ret, bitmaps, + BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + ret = kzalloc(real_size, GFP_NOFS); + if (!ret) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&ret->lock); + if (type == BTRFS_SUBPAGE_METADATA) { + atomic_set(&ret->eb_refs, 0); + } else { + atomic_set(&ret->readers, 0); + atomic_set(&ret->writers, 0); + } + return ret; +} + +void btrfs_free_subpage(struct btrfs_subpage *subpage) +{ + kfree(subpage); +} + +/* + * Increase the eb_refs of current subpage. + * + * This is important for eb allocation, to prevent race with last eb freeing + * of the same page. + * With the eb_refs increased before the eb inserted into radix tree, + * detach_extent_buffer_page() won't detach the page private while we're still + * allocating the extent buffer. 
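btrfs_init_subpage_info() above simply lays the per-type bit ranges out back to back. For the one currently supported geometry (64K pages with 4K sectors) the resulting offsets can be reproduced with nothing but arithmetic:

#include <stdio.h>

int main(void)
{
	const unsigned int page_size = 64 * 1024;
	const unsigned int sectorsize = 4 * 1024;
	const unsigned int nr_bits = page_size / sectorsize;	/* 16 bits per type */
	const char *types[] = { "uptodate", "dirty", "writeback", "ordered", "checked" };
	unsigned int cur = 0;

	for (unsigned int i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
		printf("%-10s offset %2u\n", types[i], cur);
		cur += nr_bits;
	}
	printf("total_nr_bits %u\n", cur);
	return 0;
}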
+ */ +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (!btrfs_is_subpage(fs_info, page)) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + atomic_inc(&subpage->eb_refs); +} + +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (!btrfs_is_subpage(fs_info, page)) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + ASSERT(atomic_read(&subpage->eb_refs)); + atomic_dec(&subpage->eb_refs); +} + +static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + /* Basic checks */ + ASSERT(PagePrivate(page) && page->private); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + /* + * The range check only works for mapped page, we can still have + * unmapped page like dummy extent buffer pages. + */ + if (page->mapping) + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); +} + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + + atomic_add(nbits, &subpage->readers); +} + +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + bool is_data; + bool last; + + btrfs_subpage_assert(fs_info, page, start, len); + is_data = is_data_inode(page->mapping->host); + ASSERT(atomic_read(&subpage->readers) >= nbits); + last = atomic_sub_and_test(nbits, &subpage->readers); + + /* + * For data we need to unlock the page if the last read has finished. + * + * And please don't replace @last with atomic_sub_and_test() call + * inside if () condition. + * As we want the atomic_sub_and_test() to be always executed. + */ + if (is_data && last) + unlock_page(page); +} + +static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +{ + u64 orig_start = *start; + u32 orig_len = *len; + + *start = max_t(u64, page_offset(page), orig_start); + /* + * For certain call sites like btrfs_drop_pages(), we may have pages + * beyond the target range. In that case, just set @len to 0, subpage + * helpers can handle @len == 0 without any problem. 
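The clamping described above (and implemented in btrfs_subpage_clamp_range()) is ordinary interval math: move the start up to the page boundary and trim the length to whatever overlaps the page. A standalone sketch against a hypothetical 64K page:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 65536ULL

/* Clamp [start, start + len) to the page that begins at page_off. */
static void demo_clamp_range(uint64_t page_off, uint64_t *start, uint32_t *len)
{
	uint64_t orig_start = *start;
	uint32_t orig_len = *len;
	uint64_t page_end = page_off + DEMO_PAGE_SIZE;
	uint64_t range_end = orig_start + orig_len;

	*start = orig_start > page_off ? orig_start : page_off;
	if (page_off >= range_end)
		*len = 0;	/* range ends before this page: nothing to do */
	else
		*len = (uint32_t)((page_end < range_end ? page_end : range_end) - *start);
}

int main(void)
{
	uint64_t start = 60 * 1024;	/* range starts 60K into the file... */
	uint32_t len = 16 * 1024;	/* ...and is 16K long */

	/* Clamp against the second 64K page (offset 64K). */
	demo_clamp_range(65536, &start, &len);
	printf("start=%llu len=%u\n", (unsigned long long)start, len);
	return 0;
}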
+ */ + if (page_offset(page) >= orig_start + orig_len) + *len = 0; + else + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; +} + +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->readers) == 0); + ret = atomic_add_return(nbits, &subpage->writers); + ASSERT(ret == nbits); +} + +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + + btrfs_subpage_assert(fs_info, page, start, len); + + /* + * We have call sites passing @lock_page into + * extent_clear_unlock_delalloc() for compression path. + * + * This @locked_page is locked by plain lock_page(), thus its + * subpage::writers is 0. Handle them in a special way. + */ + if (atomic_read(&subpage->writers) == 0) + return true; + + ASSERT(atomic_read(&subpage->writers) >= nbits); + return atomic_sub_and_test(nbits, &subpage->writers); +} + +/* + * Lock a page for delalloc page writeback. + * + * Return -EAGAIN if the page is not properly initialized. + * Return 0 with the page locked, and writer counter updated. + * + * Even with 0 returned, the page still need extra check to make sure + * it's really the correct page, as the caller is using + * filemap_get_folios_contig(), which can race with page invalidating. + */ +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { + lock_page(page); + return 0; + } + lock_page(page); + if (!PagePrivate(page) || !page->private) { + unlock_page(page); + return -EAGAIN; + } + btrfs_subpage_clamp_range(page, &start, &len); + btrfs_subpage_start_writer(fs_info, page, start, len); + return 0; +} + +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) + return unlock_page(page); + btrfs_subpage_clamp_range(page, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) + unlock_page(page); +} + +#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, page, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + +#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ + bitmap_test_range_all_set(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + +#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ + bitmap_test_range_all_zero(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + +void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + 
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + SetPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + ClearPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&subpage->lock, flags); + set_page_dirty(page); +} + +/* + * Extra clear_and_test function for subpage dirty bitmap. + * + * Return true if we're the last bits in the dirty_bitmap and clear the + * dirty_bitmap. + * Return false otherwise. + * + * NOTE: Callers should manually clear page dirty for true case, as we have + * extra handling for tree blocks. + */ +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); + unsigned long flags; + bool last = false; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + last = true; + spin_unlock_irqrestore(&subpage->lock, flags); + return last; +} + +void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + bool last; + + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + if (last) + clear_page_dirty_for_io(page); +} + +void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + set_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + ASSERT(PageWriteback(page)); + end_page_writeback(page); + } + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_set_ordered(const struct btrfs_fs_info 
*fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + SetPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + ClearPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + SetPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + ClearPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +/* + * Unlike set/clear which is dependent on each page status, for test all bits + * are tested in the same way. + */ +#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + name, start, len); \ + unsigned long flags; \ + bool ret; \ + \ + spin_lock_irqsave(&subpage->lock, flags); \ + ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + len >> fs_info->sectorsize_bits); \ + spin_unlock_irqrestore(&subpage->lock, flags); \ + return ret; \ +} +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); + +/* + * Note that, in selftests (extent-io-tests), we can have empty fs_info passed + * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall + * back to regular sectorsize branch. 
+ */ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ + test_page_func) \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ + return test_page_func(page); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ + return test_page_func(page); \ + btrfs_subpage_clamp_range(page, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, + PageUptodate); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, + PageDirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, + PageWriteback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, + PageOrdered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); + +/* + * Make sure not only the page dirty bit is cleared, but also subpage dirty bit + * is cleared. + */ +void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ASSERT(!PageDirty(page)); + if (!btrfs_is_subpage(fs_info, page)) + return; + + ASSERT(PagePrivate(page) && page->private); + ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); +} + +/* + * Handle different locked pages with different page sizes: + * + * - Page locked by plain lock_page() + * It should not have any subpage::writers count. + * Can be unlocked by unlock_page(). + * This is the most common locked page for __extent_writepage() called + * inside extent_write_cache_pages(). + * Rarer cases include the @locked_page from extent_write_locked_range(). + * + * - Page locked by lock_delalloc_pages() + * There is only one caller, all pages except @locked_page for + * extent_write_locked_range(). + * In this case, we have to call subpage helper to handle the case. 
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For non-subpage case, we just unlock the page */
+ if (!btrfs_is_subpage(fs_info, page))
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For the subpage case, there are two types of locked pages: with or
+ * without a writers count.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+}
+
+#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
+ bitmap_cut(dst, subpage->bitmaps, 0, \
+ subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
+
+void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
+ struct btrfs_subpage *subpage;
+ unsigned long uptodate_bitmap;
+ unsigned long error_bitmap;
+ unsigned long dirty_bitmap;
+ unsigned long writeback_bitmap;
+ unsigned long ordered_bitmap;
+ unsigned long checked_bitmap;
+ unsigned long flags;
+
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(subpage_info);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+
+ dump_page(page, "btrfs subpage dump");
+ btrfs_warn(fs_info,
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+ start, len, page_offset(page),
+ subpage_info->bitmap_nr_bits, &uptodate_bitmap,
+ subpage_info->bitmap_nr_bits, &error_bitmap,
+ subpage_info->bitmap_nr_bits, &dirty_bitmap,
+ subpage_info->bitmap_nr_bits, &writeback_bitmap,
+ subpage_info->bitmap_nr_bits, &ordered_bitmap,
+ subpage_info->bitmap_nr_bits, &checked_bitmap);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
new file mode 100644
index 0000000000..5cbf67ccbd
--- /dev/null
+++ b/fs/btrfs/subpage.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUBPAGE_H
+#define BTRFS_SUBPAGE_H
+
+#include
+
+/*
+ * Extra info for subpage bitmap.
+ *
+ * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ *  /- uptodate_offset               /- dirty_offset    /- ordered_offset
+ *  |                                |                  |
+ *  v                                v                  v
+ * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<----------------- total_nr_bits ------------------>|
+ */
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_offset indicates where the bitmap starts, the length is always
+ * @bitmap_nr_bits, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
+
+/*
+ * Structure to trace status of each sector inside a page, attached to
+ * page::private for both data and metadata inodes.
+ */
+struct btrfs_subpage {
+ /* Common members for both data and metadata pages */
+ spinlock_t lock;
+ /*
+ * Both data and metadata need to track how many readers there are for
+ * the page.
+ * Data relies on @readers to unlock the page when the last reader has
+ * finished.
+ * While metadata doesn't need page unlock, it needs to prevent
+ * page::private from being cleared before the last end_page_read().
+ */
+ atomic_t readers;
+ union {
+ /*
+ * Structures only used by metadata
+ *
+ * @eb_refs should only be operated under private_lock, as it
+ * manages whether the subpage can be detached.
+ */
+ atomic_t eb_refs;
+
+ /* Structures only used by data */
+ atomic_t writers;
+ };
+ unsigned long bitmaps[];
+};
+
+enum btrfs_subpage_type {
+ BTRFS_SUBPAGE_METADATA,
+ BTRFS_SUBPAGE_DATA,
+};
+
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+/* Allocate additional data where page represents more than one sector */
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
+void btrfs_free_subpage(struct btrfs_subpage *subpage);
+
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+/*
+ * Template for subpage related operations.
+ *
+ * btrfs_subpage_*() are for call sites where the page has subpage attached and
+ * the range is ensured to be inside the page.
+ *
+ * btrfs_page_*() are for call sites where the page can either be subpage
+ * specific or regular page. The function will handle both cases.
+ * But the range still needs to be inside the page. + * + * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * need to be inside the page. Those functions will truncate the range + * automatically. + */ +#define DECLARE_BTRFS_SUBPAGE_OPS(name) \ +void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); + +DECLARE_BTRFS_SUBPAGE_OPS(uptodate); +DECLARE_BTRFS_SUBPAGE_OPS(dirty); +DECLARE_BTRFS_SUBPAGE_OPS(writeback); +DECLARE_BTRFS_SUBPAGE_OPS(ordered); +DECLARE_BTRFS_SUBPAGE_OPS(checked); + +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + +void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct page *page); +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len); +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 0000000000..de0bfebce1 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,2557 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "delayed-inode.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "props.h" +#include "xattr.h" +#include "bio.h" +#include "export.h" +#include "compression.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "free-space-cache.h" +#include "backref.h" +#include "space-info.h" +#include "sysfs.h" +#include "zoned.h" +#include "tests/btrfs-tests.h" +#include "block-group.h" +#include "discard.h" +#include "qgroup.h" +#include "raid56.h" +#include "fs.h" +#include "accessors.h" +#include "defrag.h" +#include "dir-item.h" +#include "ioctl.h" +#include "scrub.h" +#include "verity.h" +#include "super.h" +#include "extent-tree.h" +#define CREATE_TRACE_POINTS +#include + +static const struct super_operations btrfs_super_ops; + +/* + * Types for mounting the default subvolume and a subvolume explicitly + * requested by subvol=/path. That way the callchain is straightforward and we + * don't have to play tricks with the mount options and recursive calls to + * btrfs_mount. 
+ * + * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. + */ +static struct file_system_type btrfs_fs_type; +static struct file_system_type btrfs_root_fs_type; + +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid); + close_ctree(fs_info); +} + +enum { + Opt_acl, Opt_noacl, + Opt_clear_cache, + Opt_commit_interval, + Opt_compress, + Opt_compress_force, + Opt_compress_force_type, + Opt_compress_type, + Opt_degraded, + Opt_device, + Opt_fatal_errors, + Opt_flushoncommit, Opt_noflushoncommit, + Opt_max_inline, + Opt_barrier, Opt_nobarrier, + Opt_datacow, Opt_nodatacow, + Opt_datasum, Opt_nodatasum, + Opt_defrag, Opt_nodefrag, + Opt_discard, Opt_nodiscard, + Opt_discard_mode, + Opt_norecovery, + Opt_ratio, + Opt_rescan_uuid_tree, + Opt_skip_balance, + Opt_space_cache, Opt_no_space_cache, + Opt_space_cache_version, + Opt_ssd, Opt_nossd, + Opt_ssd_spread, Opt_nossd_spread, + Opt_subvol, + Opt_subvol_empty, + Opt_subvolid, + Opt_thread_pool, + Opt_treelog, Opt_notreelog, + Opt_user_subvol_rm_allowed, + + /* Rescue options */ + Opt_rescue, + Opt_usebackuproot, + Opt_nologreplay, + Opt_ignorebadroots, + Opt_ignoredatacsums, + Opt_rescue_all, + + /* Deprecated options */ + Opt_recovery, + Opt_inode_cache, Opt_noinode_cache, + + /* Debugging options */ + Opt_check_integrity, + Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, + Opt_enospc_debug, Opt_noenospc_debug, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + Opt_ref_verify, +#endif + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_commit_interval, "commit=%u"}, + {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, + {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, + {Opt_degraded, "degraded"}, + {Opt_device, "device=%s"}, + {Opt_fatal_errors, "fatal_errors=%s"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, + {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_datacow, "datacow"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_datasum, "datasum"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, + {Opt_discard, "discard"}, + {Opt_discard_mode, "discard=%s"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_norecovery, "norecovery"}, + {Opt_ratio, "metadata_ratio=%u"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, + {Opt_skip_balance, "skip_balance"}, + {Opt_space_cache, "space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_space_cache_version, "space_cache=%s"}, + {Opt_ssd, "ssd"}, + {Opt_nossd, "nossd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd_spread, "nossd_spread"}, + {Opt_subvol, "subvol=%s"}, + {Opt_subvol_empty, "subvol="}, + {Opt_subvolid, "subvolid=%s"}, + {Opt_thread_pool, "thread_pool=%u"}, + {Opt_treelog, "treelog"}, + {Opt_notreelog, "notreelog"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + + /* Rescue options */ + {Opt_rescue, "rescue=%s"}, + /* Deprecated, with alias rescue=nologreplay */ + {Opt_nologreplay, "nologreplay"}, 
+ /* Deprecated, with alias rescue=usebackuproot */ + {Opt_usebackuproot, "usebackuproot"}, + + /* Deprecated options */ + {Opt_recovery, "recovery"}, + + /* Debugging options */ + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, +#ifdef CONFIG_BTRFS_DEBUG + {Opt_fragment_data, "fragment=data"}, + {Opt_fragment_metadata, "fragment=metadata"}, + {Opt_fragment_all, "fragment=all"}, +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + {Opt_ref_verify, "ref_verify"}, +#endif + {Opt_err, NULL}, +}; + +static const match_table_t rescue_tokens = { + {Opt_usebackuproot, "usebackuproot"}, + {Opt_nologreplay, "nologreplay"}, + {Opt_ignorebadroots, "ignorebadroots"}, + {Opt_ignorebadroots, "ibadroots"}, + {Opt_ignoredatacsums, "ignoredatacsums"}, + {Opt_ignoredatacsums, "idatacsums"}, + {Opt_rescue_all, "all"}, + {Opt_err, NULL}, +}; + +static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, + const char *opt_name) +{ + if (fs_info->mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} + +static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) +{ + char *opts; + char *orig; + char *p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0; + + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ":")) != NULL) { + int token; + + if (!*p) + continue; + token = match_token(p, rescue_tokens, args); + switch (token){ + case Opt_usebackuproot: + btrfs_info(info, + "trying to use backup root at mount time"); + btrfs_set_opt(info->mount_opt, USEBACKUPROOT); + break; + case Opt_nologreplay: + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; + case Opt_ignorebadroots: + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + break; + case Opt_ignoredatacsums: + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + break; + case Opt_rescue_all: + btrfs_info(info, "enabling all of the rescue options"); + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; + case Opt_err: + btrfs_info(info, "unrecognized rescue option '%s'", p); + ret = -EINVAL; + goto out; + default: + break; + } + + } +out: + kfree(orig); + return ret; +} + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + * XXX JDM: This needs to be cleaned up for remount. 
+ */ +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; + int ret = 0; + char *compress_type; + bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + int saved_compress_level; + bool saved_compress_force; + int no_compress = 0; + const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); + + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) + btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); + else if (btrfs_free_space_cache_v1_active(info)) { + if (btrfs_is_zoned(info)) { + btrfs_info(info, + "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(info->super_copy, 0); + } else { + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + } + } + + /* + * Even the options are empty, we still need to do extra check + * against new flags + */ + if (!options) + goto check; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + btrfs_info(info, "allowing degraded mounts"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_subvol_empty: + case Opt_subvolid: + case Opt_device: + /* + * These are parsed by btrfs_parse_subvol_options or + * btrfs_parse_device_options and can be ignored here. + */ + break; + case Opt_nodatasum: + btrfs_set_and_info(info, NODATASUM, + "setting nodatasum"); + break; + case Opt_datasum: + if (btrfs_test_opt(info, NODATASUM)) { + if (btrfs_test_opt(info, NODATACOW)) + btrfs_info(info, + "setting datasum, datacow enabled"); + else + btrfs_info(info, "setting datasum"); + } + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + if (!btrfs_test_opt(info, NODATACOW)) { + if (!btrfs_test_opt(info, COMPRESS) || + !btrfs_test_opt(info, FORCE_COMPRESS)) { + btrfs_info(info, + "setting nodatacow, compression disabled"); + } else { + btrfs_info(info, "setting nodatacow"); + } + } + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_datacow: + btrfs_clear_and_info(info, NODATACOW, + "setting datacow"); + break; + case Opt_compress_force: + case Opt_compress_force_type: + compress_force = true; + fallthrough; + case Opt_compress: + case Opt_compress_type: + saved_compress_type = btrfs_test_opt(info, + COMPRESS) ? + info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(info, FORCE_COMPRESS); + saved_compress_level = info->compress_level; + if (token == Opt_compress || + token == Opt_compress_force || + strncmp(args[0].from, "zlib", 4) == 0) { + compress_type = "zlib"; + + info->compress_type = BTRFS_COMPRESS_ZLIB; + info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + /* + * args[0] contains uninitialized data since + * for these tokens we don't expect any + * parameter. 
+ */
+ if (token != Opt_compress &&
+ token != Opt_compress_force)
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZLIB,
+ args[0].from + 4);
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ no_compress = 0;
+ } else if (strncmp(args[0].from, "lzo", 3) == 0) {
+ compress_type = "lzo";
+ info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ btrfs_set_fs_incompat(info, COMPRESS_LZO);
+ no_compress = 0;
+ } else if (strncmp(args[0].from, "zstd", 4) == 0) {
+ compress_type = "zstd";
+ info->compress_type = BTRFS_COMPRESS_ZSTD;
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZSTD,
+ args[0].from + 4);
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
+ no_compress = 0;
+ } else if (strncmp(args[0].from, "no", 2) == 0) {
+ compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ compress_force = false;
+ no_compress++;
+ } else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (compress_force) {
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ } else {
+ /*
+ * If we remount from compress-force=xxx to
+ * compress=xxx, we need to clear the FORCE_COMPRESS
+ * flag, otherwise there is no way for users
+ * to disable forcible compression separately.
+ */
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ }
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
+ btrfs_info(info, "%s %s compression, level %d",
+ (compress_force) ?
"force" : "use", + compress_type, info->compress_level); + } + compress_force = false; + break; + case Opt_ssd: + btrfs_set_and_info(info, SSD, + "enabling ssd optimizations"); + btrfs_clear_opt(info->mount_opt, NOSSD); + break; + case Opt_ssd_spread: + btrfs_set_and_info(info, SSD, + "enabling ssd optimizations"); + btrfs_set_and_info(info, SSD_SPREAD, + "using spread ssd allocation scheme"); + btrfs_clear_opt(info->mount_opt, NOSSD); + break; + case Opt_nossd: + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_and_info(info, SSD, + "not using ssd optimizations"); + fallthrough; + case Opt_nossd_spread: + btrfs_clear_and_info(info, SSD_SPREAD, + "not using spread ssd allocation scheme"); + break; + case Opt_barrier: + btrfs_clear_and_info(info, NOBARRIER, + "turning on barriers"); + break; + case Opt_nobarrier: + btrfs_set_and_info(info, NOBARRIER, + "turning off barriers"); + break; + case Opt_thread_pool: + ret = match_int(&args[0], &intarg); + if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); + goto out; + } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); + ret = -EINVAL; + goto out; + } + info->thread_pool_size = intarg; + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = memparse(num, NULL); + kfree(num); + + if (info->max_inline) { + info->max_inline = min_t(u64, + info->max_inline, + info->sectorsize); + } + btrfs_info(info, "max_inline at %llu", + info->max_inline); + } else { + ret = -ENOMEM; + goto out; + } + break; + case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + info->sb->s_flags |= SB_POSIXACL; + break; +#else + btrfs_err(info, "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif + case Opt_noacl: + info->sb->s_flags &= ~SB_POSIXACL; + break; + case Opt_notreelog: + btrfs_set_and_info(info, NOTREELOG, + "disabling tree log"); + break; + case Opt_treelog: + btrfs_clear_and_info(info, NOTREELOG, + "enabling tree log"); + break; + case Opt_norecovery: + case Opt_nologreplay: + btrfs_warn(info, + "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; + case Opt_flushoncommit: + btrfs_set_and_info(info, FLUSHONCOMMIT, + "turning on flush-on-commit"); + break; + case Opt_noflushoncommit: + btrfs_clear_and_info(info, FLUSHONCOMMIT, + "turning off flush-on-commit"); + break; + case Opt_ratio: + ret = match_int(&args[0], &intarg); + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); + goto out; + } + info->metadata_ratio = intarg; + btrfs_info(info, "metadata ratio %u", + info->metadata_ratio); + break; + case Opt_discard: + case Opt_discard_mode: + if (token == Opt_discard || + strcmp(args[0].from, "sync") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); + btrfs_set_and_info(info, DISCARD_SYNC, + "turning on sync discard"); + } else if (strcmp(args[0].from, "async") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); + btrfs_set_and_info(info, DISCARD_ASYNC, + "turning on async discard"); + } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); + ret = -EINVAL; + goto out; + } + btrfs_clear_opt(info->mount_opt, NODISCARD); + break; + case Opt_nodiscard: + btrfs_clear_and_info(info, DISCARD_SYNC, + "turning off discard"); + btrfs_clear_and_info(info, DISCARD_ASYNC, + "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); + break; + case Opt_space_cache: + 
case Opt_space_cache_version: + /* + * We already set FREE_SPACE_TREE above because we have + * compat_ro(FREE_SPACE_TREE) set, and we aren't going + * to allow v1 to be set for extent tree v2, simply + * ignore this setting if we're extent tree v2. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; + if (token == Opt_space_cache || + strcmp(args[0].from, "v1") == 0) { + btrfs_clear_opt(info->mount_opt, + FREE_SPACE_TREE); + btrfs_set_and_info(info, SPACE_CACHE, + "enabling disk space caching"); + } else if (strcmp(args[0].from, "v2") == 0) { + btrfs_clear_opt(info->mount_opt, + SPACE_CACHE); + btrfs_set_and_info(info, FREE_SPACE_TREE, + "enabling free space tree"); + } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); + ret = -EINVAL; + goto out; + } + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_no_space_cache: + /* + * We cannot operate without the free space tree with + * extent tree v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_clear_and_info(info, SPACE_CACHE, + "disabling disk space caching"); + } + if (btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_clear_and_info(info, FREE_SPACE_TREE, + "disabling free space tree"); + } + break; + case Opt_inode_cache: + case Opt_noinode_cache: + btrfs_warn(info, + "the 'inode_cache' option is deprecated and has no effect since 5.11"); + break; + case Opt_clear_cache: + /* + * We cannot clear the free space tree with extent tree + * v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; + btrfs_set_and_info(info, CLEAR_CACHE, + "force clearing of disk cache"); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + btrfs_set_and_info(info, AUTO_DEFRAG, + "enabling auto defrag"); + break; + case Opt_nodefrag: + btrfs_clear_and_info(info, AUTO_DEFRAG, + "disabling auto defrag"); + break; + case Opt_recovery: + case Opt_usebackuproot: + btrfs_warn(info, + "'%s' is deprecated, use 'rescue=usebackuproot' instead", + token == Opt_recovery ? 
"recovery" : + "usebackuproot"); + btrfs_info(info, + "trying to use backup root at mount time"); + btrfs_set_opt(info->mount_opt, USEBACKUPROOT); + break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); + btrfs_info(info, + "enabling check integrity including extent data"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); + btrfs_info(info, "enabling check integrity"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + ret = match_int(&args[0], &intarg); + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); + goto out; + } + info->check_integrity_print_mask = intarg; + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); + btrfs_info(info, "check_integrity_print_mask 0x%x", + info->check_integrity_print_mask); + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + btrfs_err(info, + "support for check_integrity* not compiled in!"); + ret = -EINVAL; + goto out; +#endif + case Opt_fatal_errors: + if (strcmp(args[0].from, "panic") == 0) { + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + } else if (strcmp(args[0].from, "bug") == 0) { + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); + ret = -EINVAL; + goto out; + } + break; + case Opt_commit_interval: + intarg = 0; + ret = match_int(&args[0], &intarg); + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; + goto out; + } + if (intarg == 0) { + btrfs_info(info, + "using default commit interval %us", + BTRFS_DEFAULT_COMMIT_INTERVAL); + intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; + } else if (intarg > 300) { + btrfs_warn(info, "excessive commit interval %d", + intarg); + } + info->commit_interval = intarg; + break; + case Opt_rescue: + ret = parse_rescue_options(info, args[0].from); + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); + goto out; + } + break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: + btrfs_info(info, "fragmenting all space"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_metadata: + btrfs_info(info, "fragmenting metadata"); + btrfs_set_opt(info->mount_opt, + FRAGMENT_METADATA); + break; + case Opt_fragment_data: + btrfs_info(info, "fragmenting data"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + break; +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + case Opt_ref_verify: + btrfs_info(info, "doing ref verification"); + btrfs_set_opt(info->mount_opt, REF_VERIFY); + break; +#endif + case Opt_err: + btrfs_err(info, "unrecognized mount option '%s'", p); + ret = -EINVAL; + goto out; + default: + break; + } + } +check: + /* We're read-only, don't have to check. 
*/ + if (new_flags & SB_RDONLY) + goto out; + + if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) + ret = -EINVAL; +out: + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { + btrfs_err(info, "cannot disable free space tree"); + ret = -EINVAL; + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); + ret = -EINVAL; + } + if (!ret) + ret = btrfs_check_mountopts_zoned(info); + if (!ret && !remounting) { + if (btrfs_test_opt(info, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_test_opt(info, FREE_SPACE_TREE)) + btrfs_info(info, "using free space tree"); + } + return ret; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_device_options(const char *options, blk_mode_t flags) +{ + substring_t args[MAX_OPT_ARGS]; + char *device_name, *opts, *orig, *p; + struct btrfs_device *device = NULL; + int error = 0; + + lockdep_assert_held(&uuid_mutex); + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because btrfs_parse_options + * gets called later + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + if (token == Opt_device) { + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + device = btrfs_scan_one_device(device_name, flags); + kfree(device_name); + if (IS_ERR(device)) { + error = PTR_ERR(device); + goto out; + } + } + } + +out: + kfree(orig); + return error; +} + +/* + * Parse mount options that are related to subvolume id + * + * The value is later passed to mount_subvol() + */ +static int btrfs_parse_subvol_options(const char *options, char **subvol_name, + u64 *subvol_objectid) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; + int error = 0; + u64 subvolid; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because + * btrfs_parse_device_options gets called later + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + kfree(*subvol_name); + *subvol_name = match_strdup(&args[0]); + if (!*subvol_name) { + error = -ENOMEM; + goto out; + } + break; + case Opt_subvolid: + error = match_u64(&args[0], &subvolid); + if (error) + goto out; + + /* we want the original fs_tree */ + if (subvolid == 0) + subvolid = BTRFS_FS_TREE_OBJECTID; + + *subvol_objectid = subvolid; + break; + default: + break; + } + } + +out: + kfree(orig); + return error; +} + +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *fs_root = NULL; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path 
*path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + + name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; + + /* + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. + */ + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + fs_root = NULL; + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. + */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_backwards(fs_root, &key, path); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } + btrfs_put_root(fs_root); + fs_root = NULL; + } + + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); + } + return name; + +err: + btrfs_put_root(fs_root); + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + struct fscrypt_str name = FSTR_INIT("default", 7); + u64 dir_id; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Find the "default" dir item which points to the root item that we + * will mount by default if we haven't been given a specific subvolume + * to mount. + */ + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); + if (IS_ERR(di)) { + btrfs_free_path(path); + return PTR_ERR(di); + } + if (!di) { + /* + * Ok the default dir item isn't there. This is weird since + * it's always been there, but don't freak out, just try and + * mount the top-level subvolume. 
+ */ + btrfs_free_path(path); + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + btrfs_free_path(path); + *objectid = location.objectid; + return 0; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data) +{ + struct inode *inode; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; + sb->s_export_op = &btrfs_export_ops; +#ifdef CONFIG_FS_VERITY + sb->s_vop = &btrfs_verityops; +#endif + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= SB_POSIXACL; +#endif + sb->s_flags |= SB_I_VERSION; + sb->s_iflags |= SB_I_CGROUPWB; + + err = super_setup_bdi(sb); + if (err) { + btrfs_err(fs_info, "super_setup_bdi failed"); + return err; + } + + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { + btrfs_err(fs_info, "open_ctree failed"); + return err; + } + + inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, err, NULL); + goto fail_close; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + err = -ENOMEM; + goto fail_close; + } + + sb->s_flags |= SB_ACTIVE; + return 0; + +fail_close: + close_ctree(fs_info); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + + trace_btrfs_sync_fs(fs_info, wait); + + if (!wait) { + filemap_flush(fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) { + /* + * Exit unless we have some pending changes + * that need to go through commit + */ + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) + return 0; + /* + * A non-blocking test if the fs is frozen. We must not + * start a new transaction here otherwise a deadlock + * happens. The pending operations are delayed to the + * next commit after thawing. + */ + if (sb_start_write_trylock(sb)) + sb_end_write(sb); + else + return 0; + trans = btrfs_start_transaction(root, 0); + } + if (IS_ERR(trans)) + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans); +} + +static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed) +{ + seq_printf(seq, "%s%s", (*printed) ? 
":" : ",rescue=", s); + *printed = true; +} + +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + const char *compress_type; + const char *subvol_name; + bool printed = false; + + if (btrfs_test_opt(info, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(info, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(info, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(info, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + seq_printf(seq, ",max_inline=%llu", info->max_inline); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%u", info->thread_pool_size); + if (btrfs_test_opt(info, COMPRESS)) { + compress_type = btrfs_compress_type2str(info->compress_type); + if (btrfs_test_opt(info, FORCE_COMPRESS)) + seq_printf(seq, ",compress-force=%s", compress_type); + else + seq_printf(seq, ",compress=%s", compress_type); + if (info->compress_level) + seq_printf(seq, ":%d", info->compress_level); + } + if (btrfs_test_opt(info, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(info, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(info, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(info, NOTREELOG)) + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(info, NOLOGREPLAY)) + print_rescue_option(seq, "nologreplay", &printed); + if (btrfs_test_opt(info, USEBACKUPROOT)) + print_rescue_option(seq, "usebackuproot", &printed); + if (btrfs_test_opt(info, IGNOREBADROOTS)) + print_rescue_option(seq, "ignorebadroots", &printed); + if (btrfs_test_opt(info, IGNOREDATACSUMS)) + print_rescue_option(seq, "ignoredatacsums", &printed); + if (btrfs_test_opt(info, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(info, DISCARD_SYNC)) + seq_puts(seq, ",discard"); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + seq_puts(seq, ",discard=async"); + if (!(info->sb->s_flags & SB_POSIXACL)) + seq_puts(seq, ",noacl"); + if (btrfs_free_space_cache_v1_active(info)) + seq_puts(seq, ",space_cache"); + else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) + seq_puts(seq, ",space_cache=v2"); + else + seq_puts(seq, ",nospace_cache"); + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) + seq_puts(seq, ",rescan_uuid_tree"); + if (btrfs_test_opt(info, CLEAR_CACHE)) + seq_puts(seq, ",clear_cache"); + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) + seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(info, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(info, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) + seq_puts(seq, ",check_int_data"); + else if (btrfs_test_opt(info, CHECK_INTEGRITY)) + seq_puts(seq, ",check_int"); + if (info->check_integrity_print_mask) + seq_printf(seq, ",check_int_print_mask=%d", + info->check_integrity_print_mask); +#endif + if (info->metadata_ratio) + seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) + seq_puts(seq, ",fatal_errors=panic"); + if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) + seq_printf(seq, ",commit=%u", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(info, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(info, 
FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif + if (btrfs_test_opt(info, REF_VERIFY)) + seq_puts(seq, ",ref_verify"); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + subvol_name = btrfs_get_subvol_name_from_objectid(info, + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + if (!IS_ERR(subvol_name)) { + seq_puts(seq, ",subvol="); + seq_escape(seq, subvol_name, " \t\n\\"); + kfree(subvol_name); + } + return 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; +} + +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + struct vfsmount *mnt) +{ + struct dentry *root; + int ret; + + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } + + } + + root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; + + if (!IS_ERR(root)) { + struct super_block *s = root->d_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(s); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + btrfs_err(fs_info, "'%s' is not a valid subvolume", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + btrfs_err(fs_info, + "subvol '%s' does not match subvolid %llu", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } + } + +out: + mntput(mnt); + kfree(subvol_name); + return root; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on mount_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, + int flags, const char *device_name, void *data) +{ + struct block_device *bdev = NULL; + struct super_block *s; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_fs_info *fs_info = NULL; + void *new_sec_opts = NULL; + blk_mode_t mode = sb_open_mode(flags); + int error = 0; + + if (data) { + error = security_sb_eat_lsm_opts(data, &new_sec_opts); + if (error) + return ERR_PTR(error); + } + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * then open_ctree will properly initialize the file system specific + * settings later. 
btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. + */ + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + if (!fs_info) { + error = -ENOMEM; + goto error_sec_opts; + } + btrfs_init_fs_info(fs_info); + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } + + mutex_lock(&uuid_mutex); + error = btrfs_parse_device_options(data, mode); + if (error) { + mutex_unlock(&uuid_mutex); + goto error_fs_info; + } + + device = btrfs_scan_one_device(device_name, mode); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + error = PTR_ERR(device); + goto error_fs_info; + } + + fs_devices = device->fs_devices; + fs_info->fs_devices = fs_devices; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + mutex_unlock(&uuid_mutex); + if (error) + goto error_fs_info; + + if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_dev->bdev; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, + fs_info); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } + + if (s->s_root) { + btrfs_close_devices(fs_devices); + btrfs_free_fs_info(fs_info); + if ((flags ^ s->s_flags) & SB_RDONLY) + error = -EBUSY; + } else { + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + s->s_id); + btrfs_sb(s)->bdev_holder = fs_type; + error = btrfs_fill_super(s, fs_devices, data); + } + if (!error) + error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); + security_free_mnt_opts(&new_sec_opts); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + + return dget(s->s_root); + +error_close_devices: + btrfs_close_devices(fs_devices); +error_fs_info: + btrfs_free_fs_info(fs_info); +error_sec_opts: + security_free_mnt_opts(&new_sec_opts); + return ERR_PTR(error); +} + +/* + * Mount function which is called by VFS layer. + * + * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() + * which needs vfsmount* of device's root (/). This means device's root has to + * be mounted internally in any case. + * + * Operation flow: + * 1. Parse subvol id related options for later use in mount_subvol(). + * + * 2. Mount device's root (/) by calling vfs_kern_mount(). + * + * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the + * first place. In order to avoid calling btrfs_mount() again, we use + * different file_system_type which is not registered to VFS by + * register_filesystem() (btrfs_root_fs_type). As a result, + * btrfs_mount_root() is called. The return value will be used by + * mount_subtree() in mount_subvol(). + * + * 3. Call mount_subvol() to get the dentry of subvolume. Since there is + * "btrfs subvolume set-default", mount_subvol() is called always. 
+ */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct vfsmount *mnt_root; + struct dentry *root; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + int error = 0; + + error = btrfs_parse_subvol_options(data, &subvol_name, + &subvol_objectid); + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + /* mount device's root (/) */ + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); + if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { + if (flags & SB_RDONLY) { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, + flags & ~SB_RDONLY, device_name, data); + } else { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, + flags | SB_RDONLY, device_name, data); + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + kfree(subvol_name); + goto out; + } + + down_write(&mnt_root->mnt_sb->s_umount); + error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); + up_write(&mnt_root->mnt_sb->s_umount); + if (error < 0) { + root = ERR_PTR(error); + mntput(mnt_root); + kfree(subvol_name); + goto out; + } + } + } + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + kfree(subvol_name); + goto out; + } + + /* mount_subvol() will free subvol_name and mnt_root */ + root = mount_subvol(subvol_name, subvol_objectid, mnt_root); + +out: + return root; +} + +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + u32 new_pool_size, u32 old_pool_size) +{ + if (new_pool_size == old_pool_size) + return; + + fs_info->thread_pool_size = new_pool_size; + + btrfs_info(fs_info, "resize thread pool %d -> %d", + old_pool_size, new_pool_size); + + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + workqueue_set_max_active(fs_info->endio_workers, new_pool_size); + workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); +} + +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & SB_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & SB_RDONLY) + sync_filesystem(fs_info->sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); + + /* + * We need to cleanup all defragable inodes if the autodefragment is + * close or the filesystem is read only. 
+ */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + /* If we toggled discard async */ + if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_resume(fs_info); + else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_cleanup(fs_info); + + /* If we toggled space cache */ + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + unsigned old_flags = sb->s_flags; + unsigned long old_opts = fs_info->mount_opt; + unsigned long old_compress_type = fs_info->compress_type; + u64 old_max_inline = fs_info->max_inline; + u32 old_thread_pool_size = fs_info->thread_pool_size; + u32 old_metadata_ratio = fs_info->metadata_ratio; + int ret; + + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (data) { + void *new_sec_opts = NULL; + + ret = security_sb_eat_lsm_opts(data, &new_sec_opts); + if (!ret) + ret = security_sb_remount(sb, new_sec_opts); + security_free_mnt_opts(&new_sec_opts); + if (ret) + goto restore; + } + + ret = btrfs_parse_options(fs_info, data, *flags); + if (ret) + goto restore; + + ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY)); + if (ret < 0) + goto restore; + + btrfs_remount_begin(fs_info, old_opts, *flags); + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); + + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from ro to rw"); + /* Make sure free space cache options match the state on disk */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } + } + + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + goto out; + + if (*flags & SB_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); + + btrfs_discard_cleanup(fs_info); + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); + + btrfs_set_sb_rdonly(sb); + + /* + * Setting SB_RDONLY will put the cleaner thread to + * sleep at the next loop if it's already active. + * If it's already asleep, we'll leave unused block + * groups on disk until we're mounted read-write again + * unless we clean them up here. + */ + btrfs_delete_unused_bgs(fs_info); + + /* + * The cleaner task could be already running before we set the + * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). + * We must make sure that after we finish the remount, i.e. 
after + * we call btrfs_commit_super(), the cleaner can no longer start + * a transaction - either because it was dropping a dead root, + * running delayed iputs or deleting an unused block group (the + * cleaner picked a block group from the list of unused block + * groups before we were able to in the previous call to + * btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, + TASK_UNINTERRUPTIBLE); + + /* + * We've set the superblock to RO mode, so we might have made + * the cleaner task sleep without running all pending delayed + * iputs. Go through all the delayed iputs here, so that if an + * unmount happens without remounting RW we don't end up at + * finishing close_ctree() with a non-empty list of delayed + * iputs. + */ + btrfs_run_delayed_iputs(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); + + /* + * Pause the qgroup rescan worker if it is running. We don't want + * it to be still running after we are in RO mode, as after that, + * by the time we unmount, it might have left a transaction open, + * so we would leak the transaction and/or crash. + */ + btrfs_qgroup_wait_for_completion(fs_info, false); + + ret = btrfs_commit_super(fs_info); + if (ret) + goto restore; + } else { + if (BTRFS_FS_ERROR(fs_info)) { + btrfs_err(fs_info, + "Remounting read-write after error is not allowed"); + ret = -EINVAL; + goto restore; + } + if (fs_info->fs_devices->rw_devices == 0) { + ret = -EACCES; + goto restore; + } + + if (!btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "too many missing devices, writable remount is not allowed"); + ret = -EACCES; + goto restore; + } + + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); + ret = -EINVAL; + goto restore; + } + + /* + * NOTE: when remounting with a change that does writes, don't + * put it anywhere above this point, as we are not sure to be + * safe to write until we pass the above checks. + */ + ret = btrfs_start_pre_rw_mount(fs_info); + if (ret) + goto restore; + + btrfs_clear_sb_rdonly(sb); + + set_bit(BTRFS_FS_OPEN, &fs_info->flags); + + /* + * If we've gone from readonly -> read/write, we need to get + * our sync/async discard lists in the right state. + */ + btrfs_discard_resume(fs_info); + } +out: + /* + * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, + * since the absence of the flag means it can be toggled off by remount. 
+ */ + *flags |= SB_I_VERSION; + + wake_up_process(fs_info->transaction_kthread); + btrfs_remount_cleanup(fs_info, old_opts); + btrfs_clear_oneshot_options(fs_info); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + return 0; + +restore: + /* We've hit an error - don't reset SB_RDONLY */ + if (sb_rdonly(sb)) + old_flags |= SB_RDONLY; + if (!(old_flags & SB_RDONLY)) + clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + sb->s_flags = old_flags; + fs_info->mount_opt = old_opts; + fs_info->compress_type = old_compress_type; + fs_info->max_inline = old_max_inline; + btrfs_resize_thread_pool(fs_info, + old_thread_pool_size, fs_info->thread_pool_size); + fs_info->metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + return ret; +} + +/* Used to sort the devices by max_avail(descending sort) */ +static int btrfs_cmp_device_free_bytes(const void *a, const void *b) +{ + const struct btrfs_device_info *dev_info1 = a; + const struct btrfs_device_info *dev_info2 = b; + + if (dev_info1->max_avail > dev_info2->max_avail) + return -1; + else if (dev_info1->max_avail < dev_info2->max_avail) + return 1; + return 0; +} + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +/* + * The helper to calc the free space on the devices that can be used to store + * file data. + */ +static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, + u64 *free_bytes) +{ + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 type; + u64 avail_space; + u64 min_stripe_size; + int num_stripes = 1; + int i = 0, nr_devices; + const struct btrfs_raid_attr *rattr; + + /* + * We aren't under the device list lock, so this is racy-ish, but good + * enough for our purposes. + */ + nr_devices = fs_info->fs_devices->open_devices; + if (!nr_devices) { + smp_mb(); + nr_devices = fs_info->fs_devices->open_devices; + ASSERT(nr_devices); + if (!nr_devices) { + *free_bytes = 0; + return 0; + } + } + + devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), + GFP_KERNEL); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space allocation */ + type = btrfs_data_alloc_profile(fs_info); + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + + if (type & BTRFS_BLOCK_GROUP_RAID0) + num_stripes = nr_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = 4; + + /* Adjust for more than 1 stripe per device */ + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + !device->bdev || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + continue; + + if (i >= nr_devices) + break; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); + + /* + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. 
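+ * (BTRFS_DEVICE_RANGE_RESERVED is the range at the start of each device
+ * that the allocator never hands out.)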
+ */ + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) + continue; + + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + rcu_read_unlock(); + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= rattr->devs_min) { + num_stripes = min(num_stripes, nr_devices); + + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * num_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - num_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + +/* + * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. + * + * If there's a redundant raid level at DATA block groups, use the respective + * multiplier to scale the sizes. + * + * Unused device space usage is based on simulating the chunk allocator + * algorithm that respects the device sizes and order of allocations. This is + * a close approximation of the actual use but there are other factors that may + * change the result (like a new metadata chunk). + * + * If metadata is exhausted, f_bavail will be 0. + */ +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct btrfs_space_info *found; + u64 total_used = 0; + u64 total_free_data = 0; + u64 total_free_meta = 0; + u32 bits = fs_info->sectorsize_bits; + __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; + unsigned factor = 1; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + int ret; + u64 thresh = 0; + int mixed = 0; + + list_for_each_entry(found, &fs_info->space_info, list) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + int i; + + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!list_empty(&found->block_groups[i])) + factor = btrfs_bg_type_to_factor( + btrfs_raid_array[i].bg_flag); + } + } + + /* + * Metadata in mixed block group profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } + + total_used += found->disk_used; + } + + buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); + buf->f_blocks >>= bits; + buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); + + /* Account global block reserve as used, it's in logical size already */ + spin_lock(&block_rsv->lock); + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; + spin_unlock(&block_rsv->lock); + + buf->f_bavail = div_u64(total_free_data, factor); + ret = btrfs_calc_avail_data_space(fs_info, &total_free_data); + if (ret) + return ret; + buf->f_bavail += div_u64(total_free_data, factor); + buf->f_bavail = buf->f_bavail >> bits; + + /* + * We calculate the remaining metadata space minus global reserve. If + * this is (supposedly) smaller than zero, there's no space. 
But this + * does not hold in practice, the exhausted state happens where's still + * some positive delta. So we apply some guesswork and compare the + * delta to a 4M threshold. (Practically observed delta was ~2M.) + * + * We probably cannot calculate the exact threshold value because this + * depends on the internal reservations requested by various + * operations, so some operations that consume a few metadata will + * succeed even if the Avail is zero. But this is better than the other + * way around. + */ + thresh = SZ_4M; + + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. + */ + if (!mixed && block_rsv->space_info->full && + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) + buf->f_bavail = 0; + + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = BTRFS_NAME_LEN; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; + buf->f_fsid.val[1] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid; + + return 0; +} + +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + kill_anon_super(sb); + btrfs_free_fs_info(fs_info); +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type btrfs_root_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount_root, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, +}; + +MODULE_ALIAS_FS("btrfs"); + +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. + */ + file->private_data = NULL; + return 0; +} + +/* + * Used by /dev/btrfs-control for devices ioctls. 
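+ * Handles BTRFS_IOC_SCAN_DEV, BTRFS_IOC_FORGET_DEV, BTRFS_IOC_DEVICES_READY
+ * and BTRFS_IOC_GET_SUPPORTED_FEATURES, all of which require CAP_SYS_ADMIN.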
+ */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_device *device = NULL; + dev_t devt = 0; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); + vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); + ret = PTR_ERR_OR_ZERO(device); + mutex_unlock(&uuid_mutex); + break; + case BTRFS_IOC_FORGET_DEV: + if (vol->name[0] != 0) { + ret = lookup_bdev(vol->name, &devt); + if (ret) + break; + } + ret = btrfs_forget_devices(devt); + break; + case BTRFS_IOC_DEVICES_READY: + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + ret = PTR_ERR(device); + break; + } + ret = !(device->fs_devices->num_devices == + device->fs_devices->total_devices); + mutex_unlock(&uuid_mutex); + break; + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + ret = btrfs_ioctl_get_supported_features((void __user*)arg); + break; + } + + kfree(vol); + return ret; +} + +static int btrfs_freeze(struct super_block *sb) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + + set_bit(BTRFS_FS_FROZEN, &fs_info->flags); + /* + * We don't need a barrier here, we'll wait for any transaction that + * could be in progress on other threads (and do delayed iputs that + * we want to avoid on a frozen filesystem), or do the commit + * ourselves. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans); +} + +static int check_dev_super(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_super_block *sb; + u16 csum_type; + int ret = 0; + + /* This should be called with fs still frozen. */ + ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags)); + + /* Missing dev, no need to check. */ + if (!dev->bdev) + return 0; + + /* Only need to check the primary super block. */ + sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Verify the checksum. */ + csum_type = btrfs_super_csum_type(sb); + if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_check_super_csum(fs_info, sb)) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + + /* Btrfs_validate_super() includes fsid check against super->fsid. 
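+ * A generation that differs from the last committed transaction is also
+ * treated as an unexpected modification (checked below).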
*/ + ret = btrfs_validate_super(fs_info, sb, 0); + if (ret < 0) + goto out; + + if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", + btrfs_super_generation(sb), + fs_info->last_trans_committed); + ret = -EUCLEAN; + goto out; + } +out: + btrfs_release_disk_super(sb); + return ret; +} + +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + int ret = 0; + + /* + * Make sure the fs is not changed by accident (like hibernation then + * modified by other OS). + * If we found anything wrong, we mark the fs error immediately. + * + * And since the fs is frozen, no one can modify the fs yet, thus + * we don't need to hold device_list_mutex. + */ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + ret = check_dev_super(device); + if (ret < 0) { + btrfs_handle_fs_error(fs_info, ret, + "super block on devid %llu got modified unexpectedly", + device->devid); + break; + } + } + clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); + + /* + * We still return 0, to allow VFS layer to unfreeze the fs even the + * above checks failed. Since the fs is either fine or read-only, we're + * safe to continue, without causing further damage. + */ + return 0; +} + +static int btrfs_show_devname(struct seq_file *m, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + + /* + * There should be always a valid pointer in latest_dev, it may be stale + * for a short moment in case it's being deleted but still valid until + * the end of RCU grace period. + */ + rcu_read_lock(); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); + rcu_read_unlock(); + + return 0; +} + +static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, + .evict_inode = btrfs_evict_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, + .show_options = btrfs_show_options, + .show_devname = btrfs_show_devname, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .free_inode = btrfs_free_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, +}; + +static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice btrfs_misc = { + .minor = BTRFS_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + +static int __init btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static __cold void btrfs_interface_exit(void) +{ + misc_deregister(&btrfs_misc); +} + +static int __init btrfs_print_mod_info(void) +{ + static const char options[] = "" +#ifdef CONFIG_BTRFS_DEBUG + ", debug=on" +#endif +#ifdef CONFIG_BTRFS_ASSERT + ", assert=on" +#endif +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + ", integrity-checker=on" +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + ", ref-verify=on" +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ", zoned=yes" +#else + ", zoned=no" +#endif +#ifdef CONFIG_FS_VERITY + ", fsverity=yes" +#else + ", fsverity=no" +#endif + ; + pr_info("Btrfs loaded%s\n", options); + return 0; +} + +static int register_btrfs(void) +{ + return register_filesystem(&btrfs_fs_type); +} + +static void unregister_btrfs(void) +{ + 
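+	/* Tear-down counterpart of register_btrfs() in mod_init_seq below. */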
unregister_filesystem(&btrfs_fs_type); +} + +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. */ + void (*exit_func)(void); +}; + +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = extent_state_init_cachep, + .exit_func = extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; + +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; + +static __always_inline void btrfs_exit_btrfs_fs(void) +{ + int i; + + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_exit_btrfs_fs(); + btrfs_cleanup_fs_uuids(); +} + +static int __init init_btrfs_fs(void) +{ + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } + mod_init_result[i] = true; + } + return 0; +} + +late_initcall(init_btrfs_fs); +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: xxhash64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: blake2b-256"); diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h new file mode 100644 index 0000000000..8dbb909b36 --- /dev/null +++ b/fs/btrfs/super.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUPER_H +#define BTRFS_SUPER_H + +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags); +int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + 
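+/*
+ * Counterpart of btrfs_set_sb_rdonly(): drop SB_RDONLY and clear the
+ * internal BTRFS_FS_STATE_RO bit when the filesystem goes back to
+ * read-write.
+ */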
+static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +#endif diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 0000000000..c9198723e4 --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,2397 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "discard.h" +#include "disk-io.h" +#include "send.h" +#include "transaction.h" +#include "sysfs.h" +#include "volumes.h" +#include "space-info.h" +#include "block-group.h" +#include "qgroup.h" +#include "misc.h" +#include "fs.h" +#include "accessors.h" + +/* + * Structure name Path + * -------------------------------------------------------------------------- + * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features + * btrfs_supported_feature_attrs /sys/fs/btrfs/features and + * /sys/fs/btrfs//features + * btrfs_attrs /sys/fs/btrfs/ + * devid_attrs /sys/fs/btrfs//devinfo/ + * allocation_attrs /sys/fs/btrfs//allocation + * qgroup_attrs /sys/fs/btrfs//qgroups/_ + * space_info_attrs /sys/fs/btrfs//allocation/ + * raid_attrs /sys/fs/btrfs//allocation// + * discard_attrs /sys/fs/btrfs//discard + * + * When built with BTRFS_CONFIG_DEBUG: + * + * btrfs_debug_feature_attrs /sys/fs/btrfs/debug + * btrfs_debug_mount_attrs /sys/fs/btrfs//debug + */ + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +/* For raid type sysfs entries */ +struct raid_kobject { + u64 flags; + struct kobject kobj; +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_W(_prefix, _name, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct kobject *get_btrfs_kobj(struct kobject *kobj); + +static struct btrfs_feature_attr 
*to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static struct btrfs_feature_attr *attr_to_btrfs_feature_attr( + struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + return btrfs_super_incompat_flags(disk_super); +} + +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); +} + +static int can_modify_feature(struct btrfs_feature_attr *fa) +{ + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + pr_warn("btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; + + return val; +} + +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val = 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + if (fs_info) { + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } else + val = can_modify_feature(fa); + + return sysfs_emit(buf, "%d\n", val); +} + +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? 
"En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + /* + * We don't want to do full transaction commit from inside sysfs + */ + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); + + return count; +} + +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; + + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; + + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); + + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; +} + +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); +BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); +BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +#ifdef CONFIG_BLK_DEV_ZONED +BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif +#ifdef CONFIG_BTRFS_DEBUG +/* Remove once support for extent tree v2 is feature complete */ +BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); +#endif +#ifdef CONFIG_FS_VERITY +BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); +#endif + +/* + * Features which depend on feature bits and may differ between each fs. + * + * /sys/fs/btrfs/features - all available features implemented by this version + * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or + * can be changed on a mounted filesystem. 
+ */ +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(compress_zstd), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(metadata_uuid), + BTRFS_FEAT_ATTR_PTR(free_space_tree), + BTRFS_FEAT_ATTR_PTR(raid1c34), + BTRFS_FEAT_ATTR_PTR(block_group_tree), +#ifdef CONFIG_BLK_DEV_ZONED + BTRFS_FEAT_ATTR_PTR(zoned), +#endif +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), +#endif +#ifdef CONFIG_FS_VERITY + BTRFS_FEAT_ATTR_PTR(verity), +#endif + NULL +}; + +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .is_visible = btrfs_feature_visible, + .attrs = btrfs_supported_feature_attrs, +}; + +static ssize_t rmdir_subvol_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "0\n"); +} +BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); + +static ssize_t supported_checksums_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < btrfs_get_num_csums(); i++) { + /* + * This "trick" only works as long as 'enum btrfs_csum_type' has + * no holes in it + */ + ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "), + btrfs_super_csum_name(i)); + + } + + ret += sysfs_emit_at(buf, ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); + +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + +static const char *rescue_opts[] = { + "usebackuproot", + "nologreplay", + "ignorebadroots", + "ignoredatacsums", + "all", +}; + +static ssize_t supported_rescue_options_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) + ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]); + ret += sysfs_emit_at(buf, ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_rescue_options, + supported_rescue_options_show); + +static ssize_t supported_sectorsizes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + + /* An artificial limit to only support 4K and PAGE_SIZE */ + if (PAGE_SIZE > SZ_4K) + ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); + ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); + + return ret; +} +BTRFS_ATTR(static_feature, supported_sectorsizes, + supported_sectorsizes_show); + +static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); +} +BTRFS_ATTR(static_feature, acl, acl_show); + +/* + * Features which only depend on kernel version. + * + * These are listed in /sys/fs/btrfs/features along with + * btrfs_supported_feature_attrs. 
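+ * All of them are read-only and do not depend on any mounted filesystem.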
+ */ +static struct attribute *btrfs_supported_static_feature_attrs[] = { + BTRFS_ATTR_PTR(static_feature, acl), + BTRFS_ATTR_PTR(static_feature, rmdir_subvol), + BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), + BTRFS_ATTR_PTR(static_feature, supported_rescue_options), + BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), + NULL +}; + +static const struct attribute_group btrfs_static_feature_attr_group = { + .name = "features", + .attrs = btrfs_supported_static_feature_attrs, +}; + +/* + * Discard statistics and tunables + */ +#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) + +static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%lld\n", + atomic64_read(&fs_info->discard_ctl.discardable_bytes)); +} +BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); + +static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%d\n", + atomic_read(&fs_info->discard_ctl.discardable_extents)); +} +BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); + +static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_bitmap_bytes); +} +BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); + +static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%lld\n", + atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); +} +BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); + +static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_extent_bytes); +} +BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); + +static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.iops_limit)); +} + +static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u32 iops_limit; + int ret; + + ret = kstrtou32(buf, 10, &iops_limit); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->iops_limit, iops_limit); + btrfs_discard_calc_delay(discard_ctl); + btrfs_discard_schedule_work(discard_ctl, true); + return len; +} +BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, + btrfs_discard_iops_limit_store); + +static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.kbps_limit)); +} + +static ssize_t 
btrfs_discard_kbps_limit_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u32 kbps_limit; + int ret; + + ret = kstrtou32(buf, 10, &kbps_limit); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); + btrfs_discard_schedule_work(discard_ctl, true); + return len; +} +BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, + btrfs_discard_kbps_limit_store); + +static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return sysfs_emit(buf, "%llu\n", + READ_ONCE(fs_info->discard_ctl.max_discard_size)); +} + +static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u64 max_discard_size; + int ret; + + ret = kstrtou64(buf, 10, &max_discard_size); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size); + + return len; +} +BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, + btrfs_discard_max_discard_size_store); + +/* + * Per-filesystem stats for discard (when mounted with discard=async). + * + * Path: /sys/fs/btrfs//discard/ + */ +static const struct attribute *discard_attrs[] = { + BTRFS_ATTR_PTR(discard, discardable_bytes), + BTRFS_ATTR_PTR(discard, discardable_extents), + BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), + BTRFS_ATTR_PTR(discard, discard_bytes_saved), + BTRFS_ATTR_PTR(discard, discard_extent_bytes), + BTRFS_ATTR_PTR(discard, iops_limit), + BTRFS_ATTR_PTR(discard, kbps_limit), + BTRFS_ATTR_PTR(discard, max_discard_size), + NULL, +}; + +#ifdef CONFIG_BTRFS_DEBUG + +/* + * Per-filesystem runtime debugging exported via sysfs. + * + * Path: /sys/fs/btrfs/UUID/debug/ + */ +static const struct attribute *btrfs_debug_mount_attrs[] = { + NULL, +}; + +/* + * Runtime debugging exported via sysfs, applies to all mounted filesystems. 
+ * + * Path: /sys/fs/btrfs/debug + */ +static struct attribute *btrfs_debug_feature_attrs[] = { + NULL +}; + +static const struct attribute_group btrfs_debug_feature_attr_group = { + .name = "debug", + .attrs = btrfs_debug_feature_attrs, +}; + +#endif + +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return sysfs_emit(buf, "%llu\n", val); +} + +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); + +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_ATTR(raid, total_bytes, raid_bytes_show); +BTRFS_ATTR(raid, used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group *block_group; + int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) + val += block_group->length; + else + val += block_group->used; + } + up_read(&sinfo->groups_sem); + return sysfs_emit(buf, "%llu\n", val); +} + +/* + * Allocation information about block group profiles. + * + * Path: /sys/fs/btrfs//allocation/// + */ +static struct attribute *raid_attrs[] = { + BTRFS_ATTR_PTR(raid, total_bytes), + BTRFS_ATTR_PTR(raid, used_bytes), + NULL +}; +ATTRIBUTE_GROUPS(raid); + +static void release_raid_kobj(struct kobject *kobj) +{ + kfree(to_raid_kobj(kobj)); +} + +static const struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_groups = raid_groups, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) + +static ssize_t btrfs_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); +} + +/* + * Store new chunk size in space info. Can be called on a read-only filesystem. + * + * If the new chunk size value is larger than 10% of free space it is reduced + * to match that limit. Alignment must be to 256M and the system chunk size + * cannot be set. 
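+ *
+ * Example (hypothetical UUID, data block group, CAP_SYS_ADMIN required):
+ *
+ *   echo 1G > /sys/fs/btrfs/UUID/allocation/data/chunk_size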
+ */ +static ssize_t btrfs_chunk_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + char *retptr; + u64 val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!fs_info->fs_devices) + return -EINVAL; + + if (btrfs_is_zoned(fs_info)) + return -EINVAL; + + /* System block type must not be changed. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -EPERM; + + val = memparse(buf, &retptr); + /* There could be trailing '\n', also catch any typos after the value */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || val == 0) + return -EINVAL; + + val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); + + /* Limit stripe size to 10% of available space. */ + val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); + + /* Must be multiple of 256M. */ + val &= ~((u64)SZ_256M - 1); + + /* Must be at least 256M. */ + if (val < SZ_256M) + return -EINVAL; + + btrfs_update_space_info_chunk_size(space_info, val); + + return len; +} + +static ssize_t btrfs_size_classes_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + struct btrfs_block_group *bg; + u32 none = 0; + u32 small = 0; + u32 medium = 0; + u32 large = 0; + + for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) { + down_read(&sinfo->groups_sem); + list_for_each_entry(bg, &sinfo->block_groups[i], list) { + if (!btrfs_block_group_should_use_size_class(bg)) + continue; + switch (bg->size_class) { + case BTRFS_BG_SZ_NONE: + none++; + break; + case BTRFS_BG_SZ_SMALL: + small++; + break; + case BTRFS_BG_SZ_MEDIUM: + medium++; + break; + case BTRFS_BG_SZ_LARGE: + large++; + break; + } + } + up_read(&sinfo->groups_sem); + } + return sysfs_emit(buf, "none %u\n" + "small %u\n" + "medium %u\n" + "large %u\n", + none, small, medium, large); +} + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Request chunk allocation with current chunk size. + */ +static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + struct btrfs_trans_handle *trans; + bool val; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + /* + * This is unsafe to be called from sysfs context and may cause + * unexpected problems. 
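+ * It is therefore only exposed when built with CONFIG_BTRFS_DEBUG.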
+ */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_force_chunk_alloc(trans, space_info->flags); + btrfs_end_transaction(trans); + + if (ret == 1) + return len; + + return -ENOSPC; +} +BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); + +#endif + +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); +SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(bytes_readonly); +SPACE_INFO_ATTR(bytes_zone_unusable); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); +} + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh < 0 || thresh > 100) + return -EINVAL; + + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); + + return len; +} + +BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, + btrfs_sinfo_bg_reclaim_threshold_show, + btrfs_sinfo_bg_reclaim_threshold_store); + +/* + * Allocation information about block group types. + * + * Path: /sys/fs/btrfs//allocation// + */ +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(space_info, flags), + BTRFS_ATTR_PTR(space_info, total_bytes), + BTRFS_ATTR_PTR(space_info, bytes_used), + BTRFS_ATTR_PTR(space_info, bytes_pinned), + BTRFS_ATTR_PTR(space_info, bytes_reserved), + BTRFS_ATTR_PTR(space_info, bytes_may_use), + BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), + BTRFS_ATTR_PTR(space_info, disk_used), + BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, chunk_size), + BTRFS_ATTR_PTR(space_info, size_classes), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(space_info, force_chunk_alloc), +#endif + NULL, +}; +ATTRIBUTE_GROUPS(space_info); + +static void space_info_release(struct kobject *kobj) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + kfree(sinfo); +} + +static const struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_groups = space_info_groups, +}; + +/* + * Allocation information about block groups. + * + * Path: /sys/fs/btrfs//allocation/ + */ +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(allocation, global_rsv_reserved), + BTRFS_ATTR_PTR(allocation, global_rsv_size), + NULL, +}; + +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + char *label = fs_info->super_copy->label; + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = sysfs_emit(buf, label[0] ? 
"%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; +} + +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + size_t p_len; + + if (!fs_info) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + /* + * p_len is the len until the first occurrence of either + * '\n' or '\0' + */ + p_len = strcspn(buf, "\n"); + + if (p_len >= BTRFS_LABEL_SIZE) + return -EINVAL; + + spin_lock(&fs_info->super_lock); + memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); + memcpy(fs_info->super_copy->label, buf, p_len); + spin_unlock(&fs_info->super_lock); + + /* + * We don't want to do full transaction commit from inside sysfs + */ + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); + + return len; +} +BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR(, nodesize, btrfs_nodesize_show); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); + +static ssize_t btrfs_commit_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, + "commits %llu\n" + "last_commit_ms %llu\n" + "max_commit_ms %llu\n" + "total_commit_ms %llu\n", + fs_info->commit_stats.commit_count, + div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); +} + +static ssize_t btrfs_commit_stats_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long val; + int ret; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return ret; + if (val) + return -EINVAL; + + WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); + + return len; +} +BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); + +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return sysfs_emit(buf, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 
1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); + +static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); +} + +BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); + +static ssize_t btrfs_checksum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); + + return sysfs_emit(buf, "%s (%s)\n", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(fs_info->csum_shash)); +} + +BTRFS_ATTR(, checksum, btrfs_checksum_show); + +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return sysfs_emit(buf, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + +static ssize_t btrfs_generation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%llu\n", fs_info->generation); +} +BTRFS_ATTR(, generation, btrfs_generation_show); + +static const char * const btrfs_read_policy_name[] = { "pid" }; + +static ssize_t btrfs_read_policy_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + ssize_t ret = 0; + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { + if (fs_devices->read_policy == i) + ret += sysfs_emit_at(buf, ret, "%s[%s]", + (ret == 0 ? "" : " "), + btrfs_read_policy_name[i]); + else + ret += sysfs_emit_at(buf, ret, "%s%s", + (ret == 0 ? 
"" : " "), + btrfs_read_policy_name[i]); + } + + ret += sysfs_emit_at(buf, ret, "\n"); + + return ret; +} + +static ssize_t btrfs_read_policy_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { + if (sysfs_streq(buf, btrfs_read_policy_name[i])) { + if (i != fs_devices->read_policy) { + fs_devices->read_policy = i; + btrfs_info(fs_devices->fs_info, + "read policy set to '%s'", + btrfs_read_policy_name[i]); + } + return len; + } + } + + return -EINVAL; +} +BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); + +static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); +} + +static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + +#ifdef CONFIG_BTRFS_DEBUG + if (thresh != 0 && (thresh > 100)) + return -EINVAL; +#else + if (thresh != 0 && (thresh <= 50 || thresh > 100)) + return -EINVAL; +#endif + + WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); + + return len; +} +BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, + btrfs_bg_reclaim_threshold_store); + +/* + * Per-filesystem information and stats. + * + * Path: /sys/fs/btrfs// + */ +static const struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(, label), + BTRFS_ATTR_PTR(, nodesize), + BTRFS_ATTR_PTR(, sectorsize), + BTRFS_ATTR_PTR(, clone_alignment), + BTRFS_ATTR_PTR(, quota_override), + BTRFS_ATTR_PTR(, metadata_uuid), + BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), + BTRFS_ATTR_PTR(, generation), + BTRFS_ATTR_PTR(, read_policy), + BTRFS_ATTR_PTR(, bg_reclaim_threshold), + BTRFS_ATTR_PTR(, commit_stats), + NULL, +}; + +static void btrfs_release_fsid_kobj(struct kobject *kobj) +{ + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); +} + +static const struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_fsid_kobj, +}; + +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); +} + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return to_fs_devs(kobj)->fs_info; +} + +static struct kobject *get_btrfs_kobj(struct kobject *kobj) +{ + while (kobj) { + if (kobj->ktype == &btrfs_ktype) + return kobj; + kobj = kobj->parent; + } + return NULL; +} + +#define NUM_FEATURE_BITS 64 +#define BTRFS_FEATURE_NAME_MAX 13 +static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; +static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; + +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == + ARRAY_SIZE(btrfs_feature_attrs)); +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == + ARRAY_SIZE(btrfs_feature_attrs[0])); + +static const u64 supported_feature_masks[FEAT_MAX] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + 
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, + &agroup); + } + + } + return 0; +} + +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->devinfo_kobj) { + kobject_del(fs_devs->devinfo_kobj); + kobject_put(fs_devs->devinfo_kobj); + fs_devs->devinfo_kobj = NULL; + } + + if (fs_devs->devices_kobj) { + kobject_del(fs_devs->devices_kobj); + kobject_put(fs_devs->devices_kobj); + fs_devs->devices_kobj = NULL; + } + + if (fs_devs->fsid_kobj.state_initialized) { + kobject_del(&fs_devs->fsid_kobj); + kobject_put(&fs_devs->fsid_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, fs_list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } +} + +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } +} + +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) +{ + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + + sysfs_remove_link(fsid_kobj, "bdi"); + + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + if (fs_info->discard_kobj) { + sysfs_remove_files(fs_info->discard_kobj, discard_attrs); + kobject_del(fs_info->discard_kobj); + kobject_put(fs_info->discard_kobj); + } +#ifdef CONFIG_BTRFS_DEBUG + if (fs_info->debug_kobj) { + sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + kobject_del(fs_info->debug_kobj); + kobject_put(fs_info->debug_kobj); + } +#endif + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(fsid_kobj, btrfs_attrs); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); +} + +static const char * const btrfs_feature_set_names[FEAT_MAX] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +const char *btrfs_feature_set_name(enum btrfs_feature_set set) +{ + return btrfs_feature_set_names[set]; +} + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 
4096; /* safe max, 64 names * 64 bytes */ + int len = 0; + int i; + char *str; + + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += scnprintf(str + len, bufsize - len, "%s%s", + len ? "," : "", name); + } + + return str; +} + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; + sfa = attr_to_btrfs_feature_attr(a); + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; + } + + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +/* + * Create a sysfs entry for a given block group type at path + * /sys/fs/btrfs/UUID/allocation/data/TYPE + */ +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; + struct raid_kobject *rkobj; + const int index = btrfs_bg_flags_to_raid_index(cache->flags); + unsigned int nofs_flag; + int ret; + + /* + * Setup a NOFS context because kobject_add(), deep in its call chain, + * does GFP_KERNEL allocations, and we are often called in a context + * where if reclaim is triggered we can deadlock (we are either holding + * a transaction handle or some lock required for a transaction + * commit). + */ + nofs_flag = memalloc_nofs_save(); + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + memalloc_nofs_restore(nofs_flag); + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; + } + + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. 
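+ *
+ * The check below is done under space_info->lock so that two racing
+ * callers cannot both install a kobject for the same raid index; the
+ * loser simply drops its freshly initialized kobject instead.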
+ */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", + btrfs_bg_type_to_raid_name(rkobj->flags)); + memalloc_nofs_restore(nofs_flag); + if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); + return; + } +} + +/* + * Remove sysfs directories for all block group types of a given space info and + * the space info as well + */ +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) +{ + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + } +} + +/* + * Create a sysfs entry for a space info type at path + * /sys/fs/btrfs/UUID/allocation/TYPE + */ +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + int ret; + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + fs_info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + return 0; +} + +void btrfs_sysfs_remove_device(struct btrfs_device *device) +{ + struct kobject *devices_kobj; + + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. 
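+ * The devid kobject and the bdev symlink therefore always live under the
+ * mounted filesystem's devinfo/ and devices/ directories, even for
+ * devices that belong to a seed filesystem.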
+ */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); + + if (device->bdev) + sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); + + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); + } +} + +static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + + return sysfs_emit(buf, "%d\n", val); +} +BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); + +static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + + return sysfs_emit(buf, "%d\n", val); +} +BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); + +static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + + return sysfs_emit(buf, "%d\n", val); +} +BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); + +static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); +} + +static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + char *endptr; + unsigned long long limit; + + limit = memparse(buf, &endptr); + /* There could be trailing '\n', also catch any typos after the value. */ + endptr = skip_spaces(endptr); + if (*endptr != 0) + return -EINVAL; + WRITE_ONCE(device->scrub_speed_max, limit); + return len; +} +BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, + btrfs_devinfo_scrub_speed_max_store); + +static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + + return sysfs_emit(buf, "%d\n", val); +} +BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); + +static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); +} +BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); + +static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + if (!device->dev_stats_valid) + return sysfs_emit(buf, "invalid\n"); + + /* + * Print all at once so we get a snapshot of all values from the same + * time. 
Keep them in sync and in order of definition of + * btrfs_dev_stat_values. + */ + return sysfs_emit(buf, + "write_errs %d\n" + "read_errs %d\n" + "flush_errs %d\n" + "corruption_errs %d\n" + "generation_errs %d\n", + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); +} +BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); + +/* + * Information about one device. + * + * Path: /sys/fs/btrfs//devinfo// + */ +static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, error_stats), + BTRFS_ATTR_PTR(devid, fsid), + BTRFS_ATTR_PTR(devid, in_fs_metadata), + BTRFS_ATTR_PTR(devid, missing), + BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, scrub_speed_max), + BTRFS_ATTR_PTR(devid, writeable), + NULL +}; +ATTRIBUTE_GROUPS(devid); + +static void btrfs_release_devid_kobj(struct kobject *kobj) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + memset(&device->devid_kobj, 0, sizeof(struct kobject)); + complete(&device->kobj_unregister); +} + +static const struct kobj_type devid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = devid_groups, + .release = btrfs_release_devid_kobj, +}; + +int btrfs_sysfs_add_device(struct btrfs_device *device) +{ + int ret; + unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; + + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); + + nofs_flag = memalloc_nofs_save(); + + if (device->bdev) { + struct kobject *disk_kobj = bdev_kobj(device->bdev); + + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; + } + } + + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); + } + +out: + memalloc_nofs_restore(nofs_flag); + return ret; +} + +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + } + + return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; +} + +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} + +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices 
*fs_devices) + +{ + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + + /* + * Sprouting changes fsid of the mounted filesystem, rename the fsid + * directory + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); + if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) + btrfs_warn(fs_devices->fs_info, + "sysfs: failed to create fsid for sprout"); +} + +void btrfs_sysfs_update_devid(struct btrfs_device *device) +{ + char tmp[24]; + + snprintf(tmp, sizeof(tmp), "%llu", device->devid); + + if (kobject_rename(&device->devid_kobj, tmp)) + btrfs_warn(device->fs_devices->fs_info, + "sysfs: failed to update devid for %llu", + device->devid); +} + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +/* + * Creates: + * /sys/fs/btrfs/UUID + * + * Can be called by the device discovery thread. + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); + if (error) { + kobject_put(&fs_devs->fsid_kobj); + return error; + } + + fs_devs->devices_kobj = kobject_create_and_add("devices", + &fs_devs->fsid_kobj); + if (!fs_devs->devices_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs device interface"); + btrfs_sysfs_remove_fsid(fs_devs); + return -ENOMEM; + } + + fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", + &fs_devs->fsid_kobj); + if (!fs_devs->devinfo_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs devinfo kobject"); + btrfs_sysfs_remove_fsid(fs_devs); + return -ENOMEM; + } + + return 0; +} + +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) +{ + int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *fsid_kobj = &fs_devs->fsid_kobj; + + error = btrfs_sysfs_add_fs_devices(fs_devs); + if (error) + return error; + + error = sysfs_create_files(fsid_kobj, btrfs_attrs); + if (error) { + btrfs_sysfs_remove_fs_devices(fs_devs); + return error; + } + + error = sysfs_create_group(fsid_kobj, + &btrfs_feature_attr_group); + if (error) + goto failure; + +#ifdef CONFIG_BTRFS_DEBUG + fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); + if (!fs_info->debug_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + if (error) + goto failure; +#endif + + /* Discard directory */ + fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); + if (!fs_info->discard_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); + if (error) + goto failure; + + error = addrm_unknown_feature_attrs(fs_info, true); + if (error) + goto failure; + + error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (error) + goto failure; + + fs_info->space_info_kobj = kobject_create_and_add("allocation", + fsid_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_mounted(fs_info); + return error; +} + +static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool enabled; + + spin_lock(&fs_info->qgroup_lock); + enabled = fs_info->qgroup_flags & 
BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", enabled); +} +BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); + +static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool inconsistent; + + spin_lock(&fs_info->qgroup_lock); + inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", inconsistent); +} +BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); + +static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 result; + + spin_lock(&fs_info->qgroup_lock); + result = fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", result); +} + +static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 new_thres; + int ret; + + ret = kstrtou8(buf, 10, &new_thres); + if (ret) + return -EINVAL; + + if (new_thres > BTRFS_MAX_LEVEL) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + fs_info->qgroup_drop_subtree_thres = new_thres; + spin_unlock(&fs_info->qgroup_lock); + + return len; +} +BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, + qgroup_drop_subtree_thres_store); + +/* + * Qgroups global info + * + * Path: /sys/fs/btrfs//qgroups/ + */ +static struct attribute *qgroups_attrs[] = { + BTRFS_ATTR_PTR(qgroups, enabled), + BTRFS_ATTR_PTR(qgroups, inconsistent), + BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + NULL +}; +ATTRIBUTE_GROUPS(qgroups); + +static void qgroups_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_type qgroups_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = qgroups_groups, + .release = qgroups_release, +}; + +static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) +{ + return to_fs_info(kobj->parent->parent); +} + +#define QGROUP_ATTR(_member, _show_name) \ +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) + +#define QGROUP_RSV_ATTR(_name, _type) \ +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->rsv.values[_type], \ + &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) + +QGROUP_ATTR(rfer, referenced); +QGROUP_ATTR(excl, exclusive); +QGROUP_ATTR(max_rfer, max_referenced); +QGROUP_ATTR(max_excl, max_exclusive); +QGROUP_ATTR(lim_flags, limit_flags); +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); 
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); + +/* + * Qgroup information. + * + * Path: /sys/fs/btrfs//qgroups/_/ + */ +static struct attribute *qgroup_attrs[] = { + BTRFS_ATTR_PTR(qgroup, referenced), + BTRFS_ATTR_PTR(qgroup, exclusive), + BTRFS_ATTR_PTR(qgroup, max_referenced), + BTRFS_ATTR_PTR(qgroup, max_exclusive), + BTRFS_ATTR_PTR(qgroup, limit_flags), + BTRFS_ATTR_PTR(qgroup, rsv_data), + BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), + BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), + NULL +}; +ATTRIBUTE_GROUPS(qgroup); + +static void qgroup_release(struct kobject *kobj) +{ + struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); + + memset(&qgroup->kobj, 0, sizeof(*kobj)); +} + +static const struct kobj_type qgroup_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = qgroup_release, + .default_groups = qgroup_groups, +}; + +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + struct kobject *qgroups_kobj = fs_info->qgroups_kobj; + int ret; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + if (qgroup->kobj.state_initialized) + return 0; + if (!qgroups_kobj) + return -EINVAL; + + ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, + "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid)); + if (ret < 0) + kobject_put(&qgroup->kobj); + + return ret; +} + +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + if (fs_info->qgroups_kobj) { + kobject_del(fs_info->qgroups_kobj); + kobject_put(fs_info->qgroups_kobj); + fs_info->qgroups_kobj = NULL; + } +} + +/* Called when qgroups get initialized, thus there is no need for locking */ +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) +{ + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + int ret = 0; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + + ASSERT(fsid_kobj); + if (fs_info->qgroups_kobj) + return 0; + + fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!fs_info->qgroups_kobj) + return -ENOMEM; + + ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, + fsid_kobj, "qgroups"); + if (ret < 0) + goto out; + + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) { + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + } + +out: + if (ret < 0) + btrfs_sysfs_del_qgroups(fs_info); + return ret; +} + +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + if (qgroup->kobj.state_initialized) { + kobject_del(&qgroup->kobj); + kobject_put(&qgroup->kobj); + } +} + +/* + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. 
Call after any changes to incompat/compat_ro flags + */ +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) +{ + struct kobject *fsid_kobj; + int ret; + + if (!fs_info) + return; + + fsid_kobj = &fs_info->fs_devices->fsid_kobj; + if (!fsid_kobj->state_initialized) + return; + + ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret < 0) + btrfs_warn(fs_info, + "failed to update /sys/fs/btrfs/%pU/features: %d", + fs_info->fs_devices->fsid, ret); +} + +int __init btrfs_init_sysfs(void) +{ + int ret; + + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + + init_feature_attrs(); + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + ret = sysfs_merge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + if (ret) + goto out_remove_group; + +#ifdef CONFIG_BTRFS_DEBUG + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } +#endif + + return 0; + +out_remove_group: + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); +out2: + kset_unregister(btrfs_kset); + + return ret; +} + +void __cold btrfs_exit_sysfs(void) +{ + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); +#ifdef CONFIG_BTRFS_DEBUG + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); +#endif + kset_unregister(btrfs_kset); +} diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h new file mode 100644 index 0000000000..86c7eef128 --- /dev/null +++ b/fs/btrfs/sysfs.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SYSFS_H +#define BTRFS_SYSFS_H + +#include + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); +int btrfs_sysfs_add_device(struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); + +int __init btrfs_init_sysfs(void); +void __cold btrfs_exit_sysfs(void); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); +void btrfs_sysfs_update_devid(struct btrfs_device *device); + +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); + +#endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c new file mode 100644 index 0000000000..ca09cf9afc --- /dev/null +++ 
b/fs/btrfs/tests/btrfs-tests.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + */ + +#include +#include +#include +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../free-space-cache.h" +#include "../free-space-tree.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" +#include "../block-group.h" +#include "../fs.h" + +static struct vfsmount *test_mnt = NULL; + +const char *test_error[] = { + [TEST_ALLOC_FS_INFO] = "cannot allocate fs_info", + [TEST_ALLOC_ROOT] = "cannot allocate root", + [TEST_ALLOC_EXTENT_BUFFER] = "cannot extent buffer", + [TEST_ALLOC_PATH] = "cannot allocate path", + [TEST_ALLOC_INODE] = "cannot allocate inode", + [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", + [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", +}; + +static const struct super_operations btrfs_test_super_ops = { + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_test_destroy_inode, +}; + + +static int btrfs_test_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &btrfs_test_super_ops; + return 0; +} + +static struct file_system_type test_type = { + .name = "btrfs_test_fs", + .init_fs_context = btrfs_test_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct inode *btrfs_new_test_inode(void) +{ + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (!inode) + return NULL; + + inode->i_mode = S_IFREG; + inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG); + + return inode; +} + +static int btrfs_init_test_fs(void) +{ + int ret; + + ret = register_filesystem(&test_type); + if (ret) { + printk(KERN_ERR "btrfs: cannot register test file system\n"); + return ret; + } + + test_mnt = kern_mount(&test_type); + if (IS_ERR(test_mnt)) { + printk(KERN_ERR "btrfs: cannot mount test file system\n"); + unregister_filesystem(&test_type); + return PTR_ERR(test_mnt); + } + return 0; +} + +static void btrfs_destroy_test_fs(void) +{ + kern_unmount(test_mnt); + unregister_filesystem(&test_type); +} + +struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + extent_io_tree_init(NULL, &dev->alloc_state, 0); + INIT_LIST_HEAD(&dev->dev_list); + list_add(&dev->dev_list, &fs_info->fs_devices->devices); + + return dev; +} + +static void btrfs_free_dummy_device(struct btrfs_device *dev) +{ + extent_io_tree_release(&dev->alloc_state); + kfree(dev); +} + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_KERNEL); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_KERNEL); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + INIT_LIST_HEAD(&fs_info->fs_devices->devices); + + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_KERNEL); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + btrfs_init_fs_info(fs_info); + + fs_info->nodesize = nodesize; + 
fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); + set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + test_mnt->mnt_sb->s_fs_info = fs_info; + + return fs_info; +} + +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + struct btrfs_device *dev, *tmp; + + if (!fs_info) + return; + + if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) + return; + + test_mnt->mnt_sb->s_fs_info = NULL; + + spin_lock(&fs_info->buffer_lock); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + slot = radix_tree_iter_retry(&iter); + continue; + } + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_mapping_tree_free(&fs_info->mapping_tree); + list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, + dev_list) { + btrfs_free_dummy_device(dev); + } + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + kfree(fs_info->super_copy); + btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (IS_ERR_OR_NULL(root)) + return; + /* Will be freed by btrfs_free_fs_roots */ + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + return; + btrfs_global_root_delete(root); + btrfs_put_root(root); +} + +struct btrfs_block_group * +btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, + unsigned long length) +{ + struct btrfs_block_group *cache; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_KERNEL); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->start = 0; + cache->length = length; + cache->full_stripe_len = fs_info->sectorsize; + cache->fs_info = fs_info; + + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); + mutex_init(&cache->free_space_lock); + + return cache; +} + +void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) +{ + if (!cache) + return; + btrfs_remove_free_space_cache(cache); + kfree(cache->free_space_ctl); + kfree(cache); +} + +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + trans->type = __TRANS_DUMMY; + trans->fs_info = fs_info; +} + +int btrfs_run_sanity_tests(void) +{ + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; + ret = btrfs_init_test_fs(); + if (ret) + return ret; + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if 
(ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } + ret = btrfs_test_extent_map(); + +out: + btrfs_destroy_test_fs(); + return ret; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h new file mode 100644 index 0000000000..7a2d7ffbe3 --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + */ + +#ifndef BTRFS_TESTS_H +#define BTRFS_TESTS_H + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_run_sanity_tests(void); + +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) +#define test_err(fmt, ...) pr_err("BTRFS: selftest: %s:%d " fmt "\n", \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define test_std_err(index) test_err("%s", test_error[index]) + +enum { + TEST_ALLOC_FS_INFO, + TEST_ALLOC_ROOT, + TEST_ALLOC_EXTENT_BUFFER, + TEST_ALLOC_PATH, + TEST_ALLOC_INODE, + TEST_ALLOC_BLOCK_GROUP, + TEST_ALLOC_EXTENT_MAP, +}; + +extern const char *test_error[]; + +struct btrfs_root; +struct btrfs_trans_handle; + +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); +int btrfs_test_inodes(u32 sectorsize, u32 nodesize); +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_map(void); +struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); +void btrfs_free_dummy_root(struct btrfs_root *root); +struct btrfs_block_group * +btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); +void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); +#else +static inline int btrfs_run_sanity_tests(void) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c new file mode 100644 index 0000000000..6a43a64ba5 --- /dev/null +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. 
+ */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../extent_io.h" +#include "../disk-io.h" +#include "../accessors.h" + +static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; + struct extent_buffer *eb; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; + char *split3 = "mary"; + char *split4 = " had a little"; + char buf[32]; + struct btrfs_key key; + u32 value_len = strlen(value); + int ret = 0; + + test_msg("running btrfs_split_item tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + test_std_err(TEST_ALLOC_PATH); + ret = -ENOMEM; + goto out; + } + + eb = alloc_dummy_extent_buffer(fs_info, nodesize); + path->nodes[0] = eb; + if (!eb) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = -ENOMEM; + goto out; + } + path->slots[0] = 0; + + key.objectid = 0; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = 0; + + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, path, &key, value_len); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + + key.offset = 3; + + /* + * Passing NULL trans here should be safe because we have plenty of + * space in this leaf to split the item without having to split the + * leaf. + */ + ret = btrfs_split_item(NULL, root, path, &key, 17); + if (ret) { + test_err("split item failed %d", ret); + goto out; + } + + /* + * Read the first slot, it should have the original key and contain only + * 'mary had a little' + */ + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_err("invalid key at slot 0"); + ret = -EINVAL; + goto out; + } + + if (btrfs_item_size(eb, 0) != strlen(split1)) { + test_err("invalid len in the first split"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split1)); + if (memcmp(buf, split1, strlen(split1))) { + test_err( +"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'", + (int)strlen(split1), buf, split1); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_err("invalid key at slot 1"); + ret = -EINVAL; + goto out; + } + + if (btrfs_item_size(eb, 1) != strlen(split2)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_err( + "data in the buffer doesn't match what it should in the second split"); + ret = -EINVAL; + goto out; + } + + key.offset = 1; + /* Do it again so we test memmoving the other items in the leaf */ + ret = btrfs_split_item(NULL, root, path, &key, 4); + if (ret) { + test_err("second split item failed %d", ret); + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_err("invalid key at slot 
0"); + ret = -EINVAL; + goto out; + } + + if (btrfs_item_size(eb, 0) != strlen(split3)) { + test_err("invalid len in the first split"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split3)); + if (memcmp(buf, split3, strlen(split3))) { + test_err( + "data in the buffer doesn't match what it should in the third split"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 1) { + test_err("invalid key at slot 1"); + ret = -EINVAL; + goto out; + } + + if (btrfs_item_size(eb, 1) != strlen(split4)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split4)); + if (memcmp(buf, split4, strlen(split4))) { + test_err( + "data in the buffer doesn't match what it should in the fourth split"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 2); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_err("invalid key at slot 2"); + ret = -EINVAL; + goto out; + } + + if (btrfs_item_size(eb, 2) != strlen(split2)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_err( + "data in the buffer doesn't match what it should in the last chunk"); + ret = -EINVAL; + goto out; + } +out: + btrfs_free_path(path); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) +{ + test_msg("running extent buffer operation tests"); + return test_btrfs_split_item(sectorsize, nodesize); +} diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c new file mode 100644 index 0000000000..1cc86af97d --- /dev/null +++ b/fs/btrfs/tests/extent-io-tests.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../extent_io.h" +#include "../btrfs_inode.h" + +#define PROCESS_UNLOCK (1 << 0) +#define PROCESS_RELEASE (1 << 1) +#define PROCESS_TEST_LOCKED (1 << 2) + +static noinline int process_page_range(struct inode *inode, u64 start, u64 end, + unsigned long flags) +{ + int ret; + struct folio_batch fbatch; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + int i; + int count = 0; + int loops = 0; + + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + + if (flags & PROCESS_TEST_LOCKED && + !folio_test_locked(folio)) + count++; + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); + if (flags & PROCESS_RELEASE) + folio_put(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + loops++; + if (loops > 100000) { + printk(KERN_ERR + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); + break; + } + } + + return count; +} + +#define STATE_FLAG_STR_LEN 256 + +#define PRINT_ONE_FLAG(state, dest, cur, name) \ +({ \ + if (state->state & EXTENT_##name) \ + cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \ + "%s" #name, cur == 0 ? "" : "|"); \ +}) + +static void extent_flag_to_str(const struct extent_state *state, char *dest) +{ + int cur = 0; + + dest[0] = 0; + PRINT_ONE_FLAG(state, dest, cur, DIRTY); + PRINT_ONE_FLAG(state, dest, cur, UPTODATE); + PRINT_ONE_FLAG(state, dest, cur, LOCKED); + PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DELALLOC); + PRINT_ONE_FLAG(state, dest, cur, DEFRAG); + PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); + PRINT_ONE_FLAG(state, dest, cur, NODATASUM); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); + PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); + PRINT_ONE_FLAG(state, dest, cur, NORESERVE); + PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); +} + +static void dump_extent_io_tree(const struct extent_io_tree *tree) +{ + struct rb_node *node; + char flags_str[STATE_FLAG_STR_LEN]; + + node = rb_first(&tree->state); + test_msg("io tree content:"); + while (node) { + struct extent_state *state; + + state = rb_entry(node, struct extent_state, rb_node); + extent_flag_to_str(state, flags_str); + test_msg(" start=%llu len=%llu flags=%s", state->start, + state->end + 1 - state->start, flags_str); + node = rb_next(node); + } +} + +static int test_find_delalloc(u32 sectorsize) +{ + struct inode *inode; + struct extent_io_tree *tmp; + struct page *page; + struct page *locked_page = NULL; + unsigned long index = 0; + /* In this test we need at least 2 file extents at its maximum size */ + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + u64 total_dirty = 2 * max_bytes; + u64 start, end, test_start; + bool found; + int ret = -EINVAL; + + test_msg("running find delalloc tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + tmp = &BTRFS_I(inode)->io_tree; + + /* + * Passing NULL as we don't have fs_info but tracepoints are not used + * at this point + */ + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); + + /* + * First go through and create and mark all of our pages dirty, we pin + * everything to make sure our pages don't get evicted and screw up our + * test. 
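+ * The first page keeps its lock and an extra reference as locked_page,
+ * standing in for the page a real caller of find_lock_delalloc_range()
+ * would already hold locked.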
+ */ + for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); + if (!page) { + test_err("failed to allocate test page"); + ret = -ENOMEM; + goto out; + } + SetPageDirty(page); + if (index) { + unlock_page(page); + } else { + get_page(page); + locked_page = page; + } + } + + /* Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); + start = 0; + end = start + PAGE_SIZE - 1; + found = find_lock_delalloc_range(inode, locked_page, &start, + &end); + if (!found) { + test_err("should have found at least one delalloc"); + goto out_bits; + } + if (start != 0 || end != (sectorsize - 1)) { + test_err("expected start 0 end %u, got start %llu end %llu", + sectorsize - 1, start, end); + goto out_bits; + } + unlock_extent(tmp, start, end, NULL); + unlock_page(locked_page); + put_page(locked_page); + + /* + * Test this scenario + * + * |--- delalloc ---| + * |--- search ---| + */ + test_start = SZ_64M; + locked_page = find_lock_page(inode->i_mapping, + test_start >> PAGE_SHIFT); + if (!locked_page) { + test_err("couldn't find the locked page"); + goto out_bits; + } + set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); + start = test_start; + end = start + PAGE_SIZE - 1; + found = find_lock_delalloc_range(inode, locked_page, &start, + &end); + if (!found) { + test_err("couldn't find delalloc in our range"); + goto out_bits; + } + if (start != test_start || end != max_bytes - 1) { + test_err("expected start %llu end %llu, got start %llu, end %llu", + test_start, max_bytes - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_err("there were unlocked pages in the range"); + goto out_bits; + } + unlock_extent(tmp, start, end, NULL); + /* locked_page was unlocked above */ + put_page(locked_page); + + /* + * Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + test_start = max_bytes + sectorsize; + locked_page = find_lock_page(inode->i_mapping, test_start >> + PAGE_SHIFT); + if (!locked_page) { + test_err("couldn't find the locked page"); + goto out_bits; + } + start = test_start; + end = start + PAGE_SIZE - 1; + found = find_lock_delalloc_range(inode, locked_page, &start, + &end); + if (found) { + test_err("found range when we shouldn't have"); + goto out_bits; + } + if (end != test_start + PAGE_SIZE - 1) { + test_err("did not return the proper end offset"); + goto out_bits; + } + + /* + * Test this scenario + * [------- delalloc -------| + * [max_bytes]|-- search--| + * + * We are re-using our test_start from above since it works out well. + */ + set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); + start = test_start; + end = start + PAGE_SIZE - 1; + found = find_lock_delalloc_range(inode, locked_page, &start, + &end); + if (!found) { + test_err("didn't find our range"); + goto out_bits; + } + if (start != test_start || end != total_dirty - 1) { + test_err("expected start %llu end %llu, got start %llu end %llu", + test_start, total_dirty - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_err("pages in range were not all locked"); + goto out_bits; + } + unlock_extent(tmp, start, end, NULL); + + /* + * Now to test where we run into a page that is no longer dirty in the + * range we want to find. 
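+ * The dirty bit of the page at max_bytes + 1M is cleared below, so
+ * find_lock_delalloc_range() is expected to fall back to a single-page
+ * range starting at test_start instead of returning the whole extent.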
+ */ + page = find_get_page(inode->i_mapping, + (max_bytes + SZ_1M) >> PAGE_SHIFT); + if (!page) { + test_err("couldn't find our page"); + goto out_bits; + } + ClearPageDirty(page); + put_page(page); + + /* We unlocked it in the previous test */ + lock_page(locked_page); + start = test_start; + end = start + PAGE_SIZE - 1; + /* + * Currently if we fail to find dirty pages in the delalloc range we + * will adjust max_bytes down to PAGE_SIZE and then re-search. If + * this changes at any point in the future we will need to fix this + * tests expected behavior. + */ + found = find_lock_delalloc_range(inode, locked_page, &start, + &end); + if (!found) { + test_err("didn't find our range"); + goto out_bits; + } + if (start != test_start && end != test_start + PAGE_SIZE - 1) { + test_err("expected start %llu end %llu, got start %llu end %llu", + test_start, test_start + PAGE_SIZE - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | + PROCESS_UNLOCK)) { + test_err("pages in range were not all locked"); + goto out_bits; + } + ret = 0; +out_bits: + if (ret) + dump_extent_io_tree(tmp); + clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); +out: + if (locked_page) + put_page(locked_page); + process_page_range(inode, 0, total_dirty - 1, + PROCESS_UNLOCK | PROCESS_RELEASE); + iput(inode); + return ret; +} + +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) +{ + unsigned long i; + + for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte 0 bit %lu, byte %lu has 0x%02x expect 0x%02x", + i, i / BITS_PER_BYTE, has, expect); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte %lu bit %lu, byte %lu has 0x%02x expect 0x%02x", + i / BITS_PER_BYTE, i % BITS_PER_BYTE, + i / BITS_PER_BYTE, has, expect); + return -EINVAL; + } + } + return 0; +} + +static int test_bitmap_set(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_set(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_set(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} + +static int test_bitmap_clear(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_clear(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_clear(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) +{ + unsigned long i, j; + unsigned long byte_len = eb->len; + u32 x; + int ret; + + ret = test_bitmap_clear("clear all run 1", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + 
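+	/*
+	 * Each test_bitmap_set()/test_bitmap_clear() step above and below
+	 * mirrors the same operation on a plain bitmap and on the extent
+	 * buffer; check_eb_bitmap() then compares every bit through both
+	 * lookup forms of extent_buffer_test_bit(): an absolute bit index
+	 * from byte 0, and a byte offset plus the bit offset in that byte.
+	 */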
ret = test_bitmap_set("set all", bitmap, eb, 0, 0, byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("clear all run 2", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("same byte set", bitmap, eb, 0, 2, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("same byte partial clear", bitmap, eb, 0, 4, 1); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross byte set", bitmap, eb, 2, 4, 8); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross multi byte set", bitmap, eb, 4, 4, 24); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross byte clear", bitmap, eb, 2, 6, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross multi byte clear", bitmap, eb, 4, 6, 20); + if (ret < 0) + return ret; + + /* Straddling pages test */ + if (byte_len > PAGE_SIZE) { + ret = test_bitmap_set("cross page set", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross page set all", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross page clear", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (ret < 0) + return ret; + } + + /* + * Generate a wonky pseudo-random bit pattern for the sake of not using + * something repetitive that could miss some hypothetical off-by-n bug. + */ + x = 0; + ret = test_bitmap_clear("clear all run 3", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + for (i = 0; i < byte_len * BITS_PER_BYTE / 32; i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; + for (j = 0; j < 32; j++) { + if (x & (1U << j)) { + bitmap_set(bitmap, i * 32 + j, 1); + extent_buffer_bitmap_set(eb, 0, i * 32 + j, 1); + } + } + } + + ret = check_eb_bitmap(bitmap, eb); + if (ret) { + test_err("random bit pattern failed"); + return ret; + } + + return 0; +} + +static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + unsigned long *bitmap = NULL; + struct extent_buffer *eb = NULL; + int ret; + + test_msg("running extent buffer bitmap tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + bitmap = kmalloc(nodesize, GFP_KERNEL); + if (!bitmap) { + test_err("couldn't allocate test bitmap"); + ret = -ENOMEM; + goto out; + } + + eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; + } + + ret = __test_eb_bitmaps(bitmap, eb); + if (ret) + goto out; + + free_extent_buffer(eb); + + /* + * Test again for case where the tree block is sectorsize aligned but + * not nodesize aligned. 
+ */ + eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; + } + + ret = __test_eb_bitmaps(bitmap, eb); +out: + free_extent_buffer(eb); + kfree(bitmap); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +static int test_find_first_clear_extent_bit(void) +{ + struct extent_io_tree tree; + u64 start, end; + int ret = -EINVAL; + + test_msg("running find_first_clear_extent_bit test"); + + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); + + /* Test correct handling of empty tree */ + find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + if (start != 0 || end != -1) { + test_err( + "error getting a range from completely empty tree: start %llu end %llu", + start, end); + goto out; + } + /* + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between + * 4M-32M + */ + set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + + find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != 0 || end != SZ_1M - 1) { + test_err("error finding beginning range: start %llu end %llu", + start, end); + goto out; + } + + /* Now add 32M-64M so that we have a hole between 4M-32M */ + set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + + /* + * Request first hole starting at 12M, we should get 4M-32M + */ + find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) { + test_err("error finding trimmed range: start %llu end %llu", + start, end); + goto out; + } + + /* + * Search in the middle of allocated range, should get the next one + * available, which happens to be unallocated -> 4M-32M + */ + find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) { + test_err("error finding next unalloc range: start %llu end %llu", + start, end); + goto out; + } + + /* + * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag + * being unset in this range, we should get the entry in range 64M-72M + */ + set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); + find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); + + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { + test_err("error finding exact range: start %llu end %llu", + start, end); + goto out; + } + + find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); + + /* + * Search in the middle of set range whose immediate neighbour doesn't + * have the bits set so it must be returned + */ + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { + test_err("error finding next alloc range: start %llu end %llu", + start, end); + goto out; + } + + /* + * Search beyond any known range, shall return after last known range + * and end should be -1 + */ + find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + if (start != SZ_64M + SZ_8M || end != -1) { + test_err( + "error handling beyond end of range search: start %llu end %llu", + start, end); + goto out; + } + + ret = 0; +out: + if (ret) + dump_extent_io_tree(&tree); + clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + + return ret; +} + +static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < eb->len; i++) { + struct page *page = 
eb->pages[i >> PAGE_SHIFT]; + void *addr = page_address(page) + offset_in_page(i); + + if (memcmp(addr, memory + i, 1) != 0) { + test_err("%s failed", test_name); + test_err("eb and memory diffs at byte %u, eb has 0x%02x memory has 0x%02x", + i, *(u8 *)addr, *(u8 *)(memory + i)); + return; + } + } +} + +static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { + void *eb_addr = page_address(eb->pages[i]); + + if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { + dump_eb_and_memory_contents(eb, memory, test_name); + return -EUCLEAN; + } + } + return 0; +} + +/* + * Init both memory and extent buffer contents to the same randomly generated + * contents. + */ +static void init_eb_and_memory(struct extent_buffer *eb, void *memory) +{ + get_random_bytes(memory, eb->len); + write_extent_buffer(eb, memory, 0, eb->len); +} + +static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct extent_buffer *eb = NULL; + void *memory = NULL; + int ret; + + test_msg("running extent buffer memory operation tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + memory = kvzalloc(nodesize, GFP_KERNEL); + if (!memory) { + test_err("failed to allocate memory"); + ret = -ENOMEM; + goto out; + } + + eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = -ENOMEM; + goto out; + } + + init_eb_and_memory(eb, memory); + ret = verify_eb_and_memory(eb, memory, "full eb write"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 16, 16); + memcpy_extent_buffer(eb, 0, 16, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 2048, 16); + memcpy_extent_buffer(eb, 0, 2048, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + memcpy(memory, memory + 2048, 2048); + memcpy_extent_buffer(eb, 0, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 3"); + if (ret < 0) + goto out; + + memmove(memory + 512, memory + 256, 512); + memmove_extent_buffer(eb, 512, 256, 512); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 2048, memory + 512, 2048); + memmove_extent_buffer(eb, 2048, 512, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 2"); + if (ret < 0) + goto out; + memmove(memory + 512, memory + 2048, 2048); + memmove_extent_buffer(eb, 512, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 3"); + if (ret < 0) + goto out; + + if (nodesize > PAGE_SIZE) { + memcpy(memory, memory + 4096 - 128, 256); + memcpy_extent_buffer(eb, 0, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory + 4096 - 128, memory + 4096 + 128, 256); + memcpy_extent_buffer(eb, 4096 - 128, 4096 + 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 128, memory + 4096 - 64, 256); + memmove_extent_buffer(eb, 4096 - 128, 4096 - 64, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 
4096 - 64, memory + 4096 - 128, 256); + memmove_extent_buffer(eb, 4096 - 64, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 2"); + if (ret < 0) + goto out; + } +out: + free_extent_buffer(eb); + kvfree(memory); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) +{ + int ret; + + test_msg("running extent I/O tests"); + + ret = test_find_delalloc(sectorsize); + if (ret) + goto out; + + ret = test_find_first_clear_extent_bit(); + if (ret) + goto out; + + ret = test_eb_bitmaps(sectorsize, nodesize); + if (ret) + goto out; + + ret = test_eb_mem_ops(sectorsize, nodesize); +out: + return ret; +} diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c new file mode 100644 index 0000000000..29bdd08b24 --- /dev/null +++ b/fs/btrfs/tests/extent-map-tests.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Oracle. All rights reserved. + */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../btrfs_inode.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../block-group.h" + +static void free_extent_map_tree(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + struct rb_node *node; + + write_lock(&em_tree->lock); + while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { + node = rb_first_cached(&em_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + remove_extent_mapping(em_tree, em); + +#ifdef CONFIG_BTRFS_DEBUG + if (refcount_read(&em->refs) != 1) { + test_err( +"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", + em->start, em->len, em->block_start, + em->block_len, refcount_read(&em->refs)); + + refcount_set(&em->refs, 1); + } +#endif + free_extent_map(em); + } + write_unlock(&em_tree->lock); +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet, there is a file + * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads + * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is + * reading [0, 8K) + * + * t1 t2 + * btrfs_get_extent() btrfs_get_extent() + * -> lookup_extent_mapping() ->lookup_extent_mapping() + * -> add_extent_mapping(0, 16K) + * -> return em + * ->add_extent_mapping(0, 16K) + * -> #handle -EEXIST + */ +static int test_case_1(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) +{ + struct extent_map *em; + u64 start = 0; + u64 len = SZ_8K; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Add [0, 16K) */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [0, 16K)"); + goto out; + } + free_extent_map(em); + + /* Add [16K, 20K) following [0, 16K) */ + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + em->start = SZ_16K; + em->len = SZ_4K; + em->block_start = SZ_32K; /* avoid merging */ + em->block_len = SZ_4K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [16K, 20K)"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; 
+ } + + /* Add [0, 8K), should return [0, 16K) instead. */ + em->start = start; + em->len = len; + em->block_start = start; + em->block_len = len; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); + if (ret) { + test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); + goto out; + } + if (em && + (em->start != 0 || extent_map_end(em) != SZ_16K || + em->block_start != 0 || em->block_len != SZ_16K)) { + test_err( +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", + start, start + len, ret, em->start, em->len, + em->block_start, em->block_len); + ret = -EINVAL; + } + free_extent_map(em); +out: + free_extent_map_tree(em_tree); + + return ret; +} + +/* + * Test scenario: + * + * Reading the inline ending up with EEXIST, ie. read an inline + * extent and discard page cache and read it again. + */ +static int test_case_2(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Add [0, 1K) */ + em->start = 0; + em->len = SZ_1K; + em->block_start = EXTENT_MAP_INLINE; + em->block_len = (u64)-1; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [0, 1K)"); + goto out; + } + free_extent_map(em); + + /* Add [4K, 8K) following [0, 1K) */ + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_4K; + em->block_len = SZ_4K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [4K, 8K)"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* Add [0, 1K) */ + em->start = 0; + em->len = SZ_1K; + em->block_start = EXTENT_MAP_INLINE; + em->block_len = (u64)-1; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); + if (ret) { + test_err("case2 [0 1K]: ret %d", ret); + goto out; + } + if (em && + (em->start != 0 || extent_map_end(em) != SZ_1K || + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { + test_err( +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", + ret, em->start, em->len, em->block_start, + em->block_len); + ret = -EINVAL; + } + free_extent_map(em); +out: + free_extent_map_tree(em_tree); + + return ret; +} + +static int __test_case_3(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start) +{ + struct extent_map *em; + u64 len = SZ_4K; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Add [4K, 8K) */ + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_4K; + em->block_len = SZ_4K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [4K, 8K)"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* Add [0, 16K) */ + 
em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); + if (ret) { + test_err("case3 [0x%llx 0x%llx): ret %d", + start, start + len, ret); + goto out; + } + /* + * Since bytes within em are contiguous, em->block_start is identical to + * em->start. + */ + if (em && + (start < em->start || start + len > extent_map_end(em) || + em->start != em->block_start || em->len != em->block_len)) { + test_err( +"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", + start, start + len, ret, em->start, em->len, + em->block_start, em->block_len); + ret = -EINVAL; + } + free_extent_map(em); +out: + free_extent_map_tree(em_tree); + + return ret; +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet. + * There is a file extent [0, 16K), two jobs are running concurrently + * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio + * read from [0, 4K) or [8K, 12K) or [12K, 16K). + * + * t1 goes ahead of t2 and adds em [4K, 8K) into tree. + * + * t1 t2 + * cow_file_range() btrfs_get_extent() + * -> lookup_extent_mapping() + * -> add_extent_mapping() + * -> add_extent_mapping() + */ +static int test_case_3(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) +{ + int ret; + + ret = __test_case_3(fs_info, em_tree, 0); + if (ret) + return ret; + ret = __test_case_3(fs_info, em_tree, SZ_8K); + if (ret) + return ret; + ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K)); + + return ret; +} + +static int __test_case_4(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start) +{ + struct extent_map *em; + u64 len = SZ_4K; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Add [0K, 8K) */ + em->start = 0; + em->len = SZ_8K; + em->block_start = 0; + em->block_len = SZ_8K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [0, 8K)"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* Add [8K, 32K) */ + em->start = SZ_8K; + em->len = 24 * SZ_1K; + em->block_start = SZ_16K; /* avoid merging */ + em->block_len = 24 * SZ_1K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [8K, 32K)"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + /* Add [0K, 32K) */ + em->start = 0; + em->len = SZ_32K; + em->block_start = 0; + em->block_len = SZ_32K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); + if (ret) { + test_err("case4 [0x%llx 0x%llx): ret %d", + start, len, ret); + goto out; + } + if (em && (start < em->start || start + len > extent_map_end(em))) { + test_err( +"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", + start, len, ret, em->start, em->len, em->block_start, + em->block_len); + ret = -EINVAL; + } + free_extent_map(em); +out: + free_extent_map_tree(em_tree); + + return ret; +} + +/* + * Test scenario: 
+ * + * Suppose that no extent map has been loaded into memory yet. + * There is a file extent [0, 32K), two jobs are running concurrently + * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio + * read from [0, 4K) or [4K, 8K). + * + * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K). + * + * t1 t2 + * btrfs_get_blocks_direct() btrfs_get_blocks_direct() + * -> btrfs_get_extent() -> btrfs_get_extent() + * -> lookup_extent_mapping() + * -> add_extent_mapping() -> lookup_extent_mapping() + * # load [0, 32K) + * -> btrfs_new_extent_direct() + * -> btrfs_drop_extent_cache() + * # split [0, 32K) + * -> add_extent_mapping() + * # add [8K, 32K) + * -> add_extent_mapping() + * # handle -EEXIST when adding + * # [0, 32K) + */ +static int test_case_4(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) +{ + int ret; + + ret = __test_case_4(fs_info, em_tree, 0); + if (ret) + return ret; + ret = __test_case_4(fs_info, em_tree, SZ_4K); + + return ret; +} + +static int add_compressed_extent(struct extent_map_tree *em_tree, + u64 start, u64 len, u64 block_start) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = start; + em->len = len; + em->block_start = block_start; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("cannot add extent map [%llu, %llu)", start, start + len); + return ret; + } + + return 0; +} + +struct extent_range { + u64 start; + u64 len; +}; + +/* The valid states of the tree after every drop, as described below. 
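+ *
+ * Each row holds the ranges expected to remain after one of the four
+ * drops performed by test_case_5(), in the order the drops are done.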
*/ +struct extent_range valid_ranges[][7] = { + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 3, .len = SZ_4K * 3}, /* [12k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + { .start = SZ_32K, .len = SZ_4K}, /* [32k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K}, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + } +}; + +static int validate_range(struct extent_map_tree *em_tree, int index) +{ + struct rb_node *n; + int i; + + for (i = 0, n = rb_first_cached(&em_tree->map); + valid_ranges[index][i].len && n; + i++, n = rb_next(n)) { + struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); + + if (entry->start != valid_ranges[index][i].start) { + test_err("mapping has start %llu expected %llu", + entry->start, valid_ranges[index][i].start); + return -EINVAL; + } + + if (entry->len != valid_ranges[index][i].len) { + test_err("mapping has len %llu expected %llu", + entry->len, valid_ranges[index][i].len); + return -EINVAL; + } + } + + /* + * We exited because we don't have any more entries in the extent_map + * but we still expect more valid entries. + */ + if (valid_ranges[index][i].len) { + test_err("missing an entry"); + return -EINVAL; + } + + /* We exited the loop but still have entries in the extent map. */ + if (n) { + test_err("we have a left over entry in the extent map we didn't expect"); + return -EINVAL; + } + + return 0; +} + +/* + * Test scenario: + * + * Test the various edge cases of btrfs_drop_extent_map_range, create the + * following ranges + * + * [0, 12k)[12k, 24k)[24k, 36k)[36k, 40k)[40k,64k) + * + * And then we'll drop: + * + * [8k, 12k) - test the single front split + * [12k, 20k) - test the single back split + * [28k, 32k) - test the double split + * [32k, 64k) - test whole em dropping + * + * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from + * merging the em's. 
+ */ +static int test_case_5(void) +{ + struct extent_map_tree *em_tree; + struct inode *inode; + u64 start, end; + int ret; + + test_msg("Running btrfs_drop_extent_map_range tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + /* [0, 12k) */ + ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + if (ret) { + test_err("cannot add extent range [0, 12K)"); + goto out; + } + + /* [12k, 24k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [24k, 36k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [36k, 40k) */ + ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [40k, 64k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* Drop [8k, 12k) */ + start = SZ_8K; + end = (3 * SZ_4K) - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + if (ret) + goto out; + + /* Drop [12k, 20k) */ + start = SZ_4K * 3; + end = SZ_16K + SZ_4K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + if (ret) + goto out; + + /* Drop [28k, 32k) */ + start = SZ_32K - SZ_4K; + end = SZ_32K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + if (ret) + goto out; + + /* Drop [32k, 64k) */ + start = SZ_32K; + end = SZ_64K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + if (ret) + goto out; +out: + iput(inode); + return ret; +} + +/* + * Test the btrfs_add_extent_mapping helper which will attempt to create an em + * for areas between two existing ems. Validate it doesn't do this when there + * are two unmerged em's side by side. + */ +static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +{ + struct extent_map *em = NULL; + int ret; + + ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + if (ret) + goto out; + + ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + if (ret) + goto out; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_16K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + write_unlock(&em_tree->lock); + + if (ret != 0) { + test_err("got an error when adding our em: %d", ret); + goto out; + } + + ret = -EINVAL; + if (em->start != 0) { + test_err("unexpected em->start at %llu, wanted 0", em->start); + goto out; + } + if (em->len != SZ_4K) { + test_err("unexpected em->len %llu, expected 4K", em->len); + goto out; + } + ret = 0; +out: + free_extent_map(em); + free_extent_map_tree(em_tree); + return ret; +} + +/* + * Regression test for btrfs_drop_extent_map_range. Calling with skip_pinned == + * true would mess up the start/end calculations and subsequent splits would be + * incorrect. 
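+ *
+ * The test adds a pinned em for [0, 16K) and an unpinned em for
+ * [32K, 48K), drops [0, 36K) with skip_pinned set, and then checks that
+ * the pinned em is untouched while the second em is trimmed to
+ * [36K, 48K).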
+ */ +static int test_case_7(void) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct inode *inode; + int ret; + + test_msg("Running btrfs_drop_extent_cache with pinned"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [0, 16K), pinned */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [32K, 48K), not pinned */ + em->start = SZ_32K; + em->len = SZ_16K; + em->block_start = SZ_32K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + /* + * Drop [0, 36K) This should skip the [0, 4K) extent and then split the + * [32K, 48K) extent. + */ + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + + /* Make sure our extent maps look sane. */ + ret = -EINVAL; + + em = lookup_extent_mapping(em_tree, 0, SZ_16K); + if (!em) { + test_err("didn't find an em at 0 as expected"); + goto out; + } + + if (em->start != 0) { + test_err("em->start is %llu, expected 0", em->start); + goto out; + } + + if (em->len != SZ_16K) { + test_err("em->len is %llu, expected 16K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an em when we weren't expecting one"); + goto out; + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + read_unlock(&em_tree->lock); + if (!em) { + test_err("didn't find an em at 32K as expected"); + goto out; + } + + if (em->start != (36 * SZ_1K)) { + test_err("em->start is %llu, expected 36K", em->start); + goto out; + } + + if (em->len != (12 * SZ_1K)) { + test_err("em->len is %llu, expected 12K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an unexpected em above 48K"); + goto out; + } + + ret = 0; +out: + free_extent_map(em); + iput(inode); + return ret; +} + +struct rmap_test_vector { + u64 raid_type; + u64 physical_start; + u64 data_stripe_size; + u64 num_data_stripes; + u64 num_stripes; + /* Assume we won't have more than 5 physical stripes */ + u64 data_stripe_phys_start[5]; + bool expected_mapped_addr; + /* Physical to logical addresses */ + u64 mapped_logical[5]; +}; + +static int test_rmap_block(struct btrfs_fs_info *fs_info, + struct rmap_test_vector *test) +{ + struct extent_map *em; + struct map_lookup *map = NULL; + u64 *logical = NULL; + int i, out_ndaddrs, out_stripe_len; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + if (!map) { + kfree(em); + 
test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + /* Start at 4GiB logical address */ + em->start = SZ_4G; + em->len = test->data_stripe_size * test->num_data_stripes; + em->block_len = em->len; + em->orig_block_len = test->data_stripe_size; + em->map_lookup = map; + + map->num_stripes = test->num_stripes; + map->type = test->raid_type; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info); + + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + map->stripes[i].dev = dev; + map->stripes[i].physical = test->data_stripe_phys_start[i]; + } + + write_lock(&fs_info->mapping_tree.lock); + ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); + write_unlock(&fs_info->mapping_tree.lock); + if (ret) { + test_err("error adding block group mapping to mapping tree"); + goto out_free; + } + + ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + &logical, &out_ndaddrs, &out_stripe_len); + if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { + test_err("didn't rmap anything but expected %d", + test->expected_mapped_addr); + goto out; + } + + if (out_stripe_len != BTRFS_STRIPE_LEN) { + test_err("calculated stripe length doesn't match"); + goto out; + } + + if (out_ndaddrs != test->expected_mapped_addr) { + for (i = 0; i < out_ndaddrs; i++) + test_msg("mapped %llu", logical[i]); + test_err("unexpected number of mapped addresses: %d", out_ndaddrs); + goto out; + } + + for (i = 0; i < out_ndaddrs; i++) { + if (logical[i] != test->mapped_logical[i]) { + test_err("unexpected logical address mapped"); + goto out; + } + } + + ret = 0; +out: + write_lock(&fs_info->mapping_tree.lock); + remove_extent_mapping(&fs_info->mapping_tree, em); + write_unlock(&fs_info->mapping_tree.lock); + /* For us */ + free_extent_map(em); +out_free: + /* For the tree */ + free_extent_map(em); + kfree(logical); + return ret; +} + +int btrfs_test_extent_map(void) +{ + struct btrfs_fs_info *fs_info = NULL; + struct extent_map_tree *em_tree; + int ret = 0, i; + struct rmap_test_vector rmap_tests[] = { + { + /* + * Test a chunk with 2 data stripes one of which + * intersects the physical address of the super block + * is correctly recognised. + */ + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .physical_start = SZ_64M - SZ_4M, + .data_stripe_size = SZ_256M, + .num_data_stripes = 2, + .num_stripes = 2, + .data_stripe_phys_start = + {SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M}, + .expected_mapped_addr = true, + .mapped_logical= {SZ_4G + SZ_4M} + }, + { + /* + * Test that out-of-range physical addresses are + * ignored + */ + + /* SINGLE chunk type */ + .raid_type = 0, + .physical_start = SZ_4G, + .data_stripe_size = SZ_256M, + .num_data_stripes = 1, + .num_stripes = 1, + .data_stripe_phys_start = {SZ_256M}, + .expected_mapped_addr = false, + .mapped_logical = {0} + } + }; + + test_msg("running extent_map tests"); + + /* + * Note: the fs_info is not set up completely, we only need + * fs_info::fsid for the tracepoint. 
+ */ + fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); + if (!em_tree) { + ret = -ENOMEM; + goto out; + } + + extent_map_tree_init(em_tree); + + ret = test_case_1(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_2(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_3(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_4(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_5(); + if (ret) + goto out; + ret = test_case_6(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_7(); + if (ret) + goto out; + + test_msg("running rmap tests"); + for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { + ret = test_rmap_block(fs_info, &rmap_tests[i]); + if (ret) + goto out; + } + +out: + kfree(em_tree); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c new file mode 100644 index 0000000000..ebf68fcd21 --- /dev/null +++ b/fs/btrfs/tests/free-space-tests.c @@ -0,0 +1,1063 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../disk-io.h" +#include "../free-space-cache.h" +#include "../block-group.h" + +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) + +/* + * This test just does basic sanity checking, making sure we can add an extent + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group *cache) +{ + int ret = 0; + + test_msg("running extent only tests"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, SZ_4M); + if (ret) { + test_err("error adding initial extents %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, SZ_4M); + if (ret) { + test_err("error removing extent %d", ret); + return ret; + } + + if (test_check_exists(cache, 0, SZ_4M)) { + test_err("full remove left some lingering space"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, SZ_4M); + if (ret) { + test_err("error adding half extent %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); + if (ret) { + test_err("error removing tail end %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, SZ_1M); + if (ret) { + test_err("error removing front end %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, SZ_2M, 4096); + if (ret) { + test_err("error removing middle piece %d", ret); + return ret; + } + + if (test_check_exists(cache, 0, SZ_1M)) { + test_err("still have space at the front"); + return -1; + } + + if (test_check_exists(cache, SZ_2M, 4096)) { + test_err("still have space in the middle"); + return -1; + } + + if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { + test_err("still have space at the end"); + return -1; + } + + /* Cleanup */ + btrfs_remove_free_space_cache(cache); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize) +{ + u64 next_bitmap_offset; + int ret; + + test_msg("running bitmap only tests"); + + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); + if (ret) { + test_err("couldn't create a bitmap entry %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, SZ_4M); + if (ret) { 
+ test_err("error removing bitmap full range %d", ret); + return ret; + } + + if (test_check_exists(cache, 0, SZ_4M)) { + test_err("left some space in bitmap"); + return -1; + } + + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); + if (ret) { + test_err("couldn't add to our bitmap entry %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); + if (ret) { + test_err("couldn't remove middle chunk %d", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); + + /* Test a bit straddling two bitmaps */ + ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, + SZ_4M, 1); + if (ret) { + test_err("couldn't add space that straddles two bitmaps %d", + ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); + if (ret) { + test_err("couldn't remove overlapping space %d", ret); + return ret; + } + + if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { + test_err("left some space when removing overlapping"); + return -1; + } + + btrfs_remove_free_space_cache(cache); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group *cache, + u32 sectorsize) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); + int ret; + + test_msg("running bitmap and extent tests"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); + if (ret) { + test_err("couldn't create bitmap entry %d", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); + if (ret) { + test_err("couldn't add extent entry %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, SZ_1M); + if (ret) { + test_err("couldn't remove extent entry %d", ret); + return ret; + } + + if (test_check_exists(cache, 0, SZ_1M)) { + test_err("left remnants after our remove"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); + if (ret) { + test_err("couldn't re-add extent entry %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); + if (ret) { + test_err("couldn't remove from bitmap %d", ret); + return ret; + } + + if (test_check_exists(cache, SZ_4M, SZ_1M)) { + test_err("left remnants in the bitmap"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. 
+ */ + ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); + if (ret) { + test_err("couldn't add to a bitmap %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); + if (ret) { + test_err("couldn't remove overlapping space %d", ret); + return ret; + } + + if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { + test_err("left over pieces after removing overlapping"); + return -1; + } + + btrfs_remove_free_space_cache(cache); + + /* Now with the extent entry offset into the bitmap */ + ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); + if (ret) { + test_err("couldn't add space to the bitmap %d", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); + if (ret) { + test_err("couldn't add extent to the cache %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); + if (ret) { + test_err("problem removing overlapping space %d", ret); + return ret; + } + + if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { + test_err("left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + btrfs_remove_free_space_cache(cache); + ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); + if (ret) { + test_err("couldn't add bitmap %d", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M, + 5 * SZ_1M, 0); + if (ret) { + test_err("couldn't add extent entry %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); + if (ret) { + test_err("failed to free our space %d", ret); + return ret; + } + + if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { + test_err("left stuff over"); + return -1; + } + + btrfs_remove_free_space_cache(cache); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); + if (ret) { + test_err("couldn't add bitmap entry %d", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); + if (ret) { + test_err("couldn't add extent entry %d", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); + if (ret) { + test_err("error removing bitmap and extent overlapping %d", ret); + return ret; + } + + btrfs_remove_free_space_cache(cache); + return 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return ctl->free_extents > 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). 
*/ +static int +check_num_extents_and_bitmaps(const struct btrfs_block_group *cache, + const int num_extents, + const int num_bitmaps) +{ + if (cache->free_space_ctl->free_extents != num_extents) { + test_err( + "incorrect # of extent entries in the cache: %d, expected %d", + cache->free_space_ctl->free_extents, num_extents); + return -EINVAL; + } + if (cache->free_space_ctl->total_bitmaps != num_bitmaps) { + test_err( + "incorrect # of extent entries in the cache: %d, expected %d", + cache->free_space_ctl->total_bitmaps, num_bitmaps); + return -EINVAL; + } + return 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int check_cache_empty(struct btrfs_block_group *cache) +{ + u64 offset; + u64 max_extent_size; + + /* + * Now lets confirm that there's absolutely no free space left to + * allocate. + */ + if (cache->free_space_ctl->free_space != 0) { + test_err("cache free space is not 0"); + return -EINVAL; + } + + /* And any allocation request, no matter how small, should fail now. */ + offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, + &max_extent_size); + if (offset != 0) { + test_err("space allocation did not fail, returned offset: %llu", + offset); + return -EINVAL; + } + + /* And no extent nor bitmap entries in the cache anymore. */ + return check_num_extents_and_bitmaps(cache, 0, 0); +} + +/* + * Before we were able to steal free space from a bitmap entry to an extent + * entry, we could end up with 2 entries representing a contiguous free space. + * One would be an extent entry and the other a bitmap entry. Since in order + * to allocate space to a caller we use only 1 entry, we couldn't return that + * whole range to the caller if it was requested. This forced the caller to + * either assume ENOSPC or perform several smaller space allocations, which + * wasn't optimal as they could be spread all over the block group while under + * concurrency (extra overhead and fragmentation). + * + * This stealing approach is beneficial, since we always prefer to allocate + * from extent entries, both for clustered and non-clustered allocation + * requests. + */ +static int +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, + u32 sectorsize) +{ + int ret; + u64 offset; + u64 max_extent_size; + const struct btrfs_free_space_op test_free_space_ops = { + .use_bitmap = test_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; + + test_msg("running space stealing from bitmap to extent tests"); + + /* + * For this test, we want to ensure we end up with an extent entry + * immediately adjacent to a bitmap entry, where the bitmap starts + * at an offset where the extent entry ends. We keep adding and + * removing free space to reach into this state, but to get there + * we need to reach a point where marking new free space doesn't + * result in adding new extent entries or merging the new space + * with existing extent entries - the space ends up being marked + * in an existing bitmap that covers the new free space range. + * + * To get there, we need to reach the threshold defined set at + * cache->free_space_ctl->extents_thresh, which currently is + * 256 extents on a x86_64 system at least, and a few other + * conditions (check free_space_cache.c). Instead of making the + * test much longer and complicated, use a "use_bitmap" operation + * that forces use of bitmaps as soon as we have at least 1 + * extent entry. 
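+ *
+ * The block group's original free space ops are saved in
+ * orig_free_space_ops and restored once the test finishes.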
+ */ + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; + + /* + * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ + */ + ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); + if (ret) { + test_err("couldn't add extent entry %d", ret); + return ret; + } + + /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ + ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, + SZ_128M - SZ_512K, 1); + if (ret) { + test_err("couldn't add bitmap entry %d", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the first 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb - 256Kb, 128Mb - 128Kb[ + * [128Mb + 512Kb, 128Mb + 768Kb[ + */ + ret = btrfs_remove_free_space(cache, + SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K); + if (ret) { + test_err("failed to free part of bitmap space %d", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { + test_err("free space range missing"); + return -ENOENT; + } + if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { + test_err("free space range missing"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K)) { + test_err("bitmap region not removed from space cache"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { + test_err("invalid bitmap region marked as free"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered + * by the bitmap too, isn't marked as free either. + */ + if (test_check_exists(cache, SZ_128M, SZ_256K)) { + test_err("invalid bitmap region marked as free"); + return -EINVAL; + } + + /* + * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But, + * lets make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); + if (ret) { + test_err("error adding free space: %d", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, SZ_128M, SZ_512K)) { + test_err("bitmap region not marked as free"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now lets add a small free space region to the right of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); + if (ret) { + test_err("error adding free space: %d", ret); + return ret; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. 
+ */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb - 256Kb, 128Mb - 128Kb[. + */ + ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); + if (ret) { + test_err("error adding free space: %d", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { + test_err("extent region not marked as free"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't stole all free space from the + * bitmap, because of the small 4Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb - 256Kb, 128Mb[ + * bitmap entry covering range: [128Mb, 128Mb + 768Kb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { + test_err("expected region not marked as free"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { + test_err("cache free space is not 1Mb + %u", sectorsize); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, SZ_1M, 0, + &max_extent_size); + if (offset != (SZ_128M - SZ_256K)) { + test_err( + "failed to allocate 1Mb from space cache, returned offset is: %llu", + offset); + return -EINVAL; + } + + /* + * All that remains is a sectorsize free space region in a bitmap. + * Confirm. + */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != sectorsize) { + test_err("cache free space is not %u", sectorsize); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, sectorsize, 0, + &max_extent_size); + if (offset != (SZ_128M + SZ_16M)) { + test_err("failed to allocate %u, returned offset : %llu", + sectorsize, offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + btrfs_remove_free_space_cache(cache); + + /* + * Now test a similar scenario, but where our extent entry is located + * to the right of the bitmap entry, so that we can check that stealing + * space from a bitmap to the front of an extent entry works. 
+ */ + + /* + * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ + */ + ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); + if (ret) { + test_err("couldn't add extent entry %d", ret); + return ret; + } + + /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ + ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); + if (ret) { + test_err("couldn't add bitmap entry %d", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the last 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb + 128b, 128Mb + 256Kb[ + * [128Mb - 768Kb, 128Mb - 512Kb[ + */ + ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); + if (ret) { + test_err("failed to free part of bitmap space %d", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { + test_err("free space range missing"); + return -ENOENT; + } + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { + test_err("free space range missing"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { + test_err("bitmap region not removed from space cache"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb - 512Kb, 128Mb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { + test_err("invalid bitmap region marked as free"); + return -EINVAL; + } + + /* + * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But, + * lets make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); + if (ret) { + test_err("error adding free space: %d", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { + test_err("bitmap region not marked as free"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now lets add a small free space region to the left of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); + if (ret) { + test_err("error adding free space: %d", ret); + return ret; + } + + /* + * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb + 128Kb, 128Mb + 256Kb[. + */ + ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); + if (ret) { + test_err("error adding free space: %d", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, SZ_128M, SZ_128K)) { + test_err("extent region not marked as free"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't stole all free space from the + * bitmap, because of the small 2 * sectorsize free space region. 
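+ *
+ * That small 2 * sectorsize region at 32Mb is still tracked by the
+ * bitmap, so the entry counts checked below must stay the same as they
+ * were before the steal.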
+ */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb, 128Mb + 256Kb[ + * bitmap entry covering range: [128Mb - 768Kb, 128Mb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { + test_err("expected region not marked as free"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { + test_err("cache free space is not 1Mb + %u", 2 * sectorsize); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, + &max_extent_size); + if (offset != (SZ_128M - 768 * SZ_1K)) { + test_err( + "failed to allocate 1Mb from space cache, returned offset is: %llu", + offset); + return -EINVAL; + } + + /* + * All that remains is 2 * sectorsize free space region + * in a bitmap. Confirm. + */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 2 * sectorsize) { + test_err("cache free space is not %u", 2 * sectorsize); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 2 * sectorsize, 0, + &max_extent_size); + if (offset != SZ_32M) { + test_err("failed to allocate %u, offset: %llu", + 2 * sectorsize, offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + cache->free_space_ctl->op = orig_free_space_ops; + btrfs_remove_free_space_cache(cache); + + return 0; +} + +static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return true; +} + +static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) +{ + const struct btrfs_free_space_op test_free_space_ops = { + .use_bitmap = bytes_index_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + u64 offset, max_extent_size, bytes; + int ret, i; + + test_msg("running bytes index tests"); + + /* First just validate that it does everything in order. */ + offset = 0; + for (i = 0; i < 10; i++) { + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 0); + if (ret) { + test_err("couldn't add extent entry %d\n", ret); + return ret; + } + offset += bytes + sectorsize; + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps do the correct thing. 
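+ *
+ * Two bitmap entries of 1Mb and 2Mb are added and the bytes index is
+ * expected to hand them back largest first, just like the plain extent
+ * entries above.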
*/ + btrfs_remove_free_space_cache(cache); + for (i = 0; i < 2; i++) { + offset = i * BITS_PER_BITMAP * sectorsize; + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps with different ->max_extent_size. */ + btrfs_remove_free_space_cache(cache); + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; + + ret = test_add_free_space_entry(cache, 0, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + + offset = BITS_PER_BITMAP * sectorsize; + ret = test_add_free_space_entry(cache, offset, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap_entry"); + return ret; + } + + /* + * Now set a bunch of sectorsize extents in the first entry so it's + * ->bytes is large. + */ + for (i = 2; i < 20; i += 2) { + offset = sectorsize * i; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error populating sparse bitmap %d", ret); + return ret; + } + } + + /* + * Now set a contiguous extent in the second bitmap so its + * ->max_extent_size is larger than the first bitmaps. + */ + offset = (BITS_PER_BITMAP * sectorsize) + sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding contiguous extent %d", ret); + return ret; + } + + /* + * Since we don't set ->max_extent_size unless we search everything + * should be indexed on bytes. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (10 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3, + 0, &max_extent_size); + if (offset != 0) { + test_err("found space to alloc even though we don't have enough space"); + return -EINVAL; + } + + if (max_extent_size != (2 * sectorsize)) { + test_err("got the wrong max_extent size %llu expected %llu", + max_extent_size, (unsigned long long)(2 * sectorsize)); + return -EINVAL; + } + + /* + * The search should have re-arranged the bytes index to use the + * ->max_extent_size, validate it's now what we expect it to be. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (2 * sectorsize)) { + test_err("error, the bytes index wasn't recalculated properly"); + return -EINVAL; + } + + /* Add another sectorsize to re-arrange the tree back to ->bytes. 
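+ *
+ * This adds one more sector to the sparse first bitmap, bringing it to
+ * 11 * sectorsize, so once the tree is keyed on ->bytes again it should
+ * sort ahead of the second bitmap in the first slot.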
*/ + offset = (BITS_PER_BITMAP * sectorsize) - sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding extent to the sparse entry %d", ret); + return ret; + } + + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (11 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + /* + * Now make sure we find our correct entry after searching that will + * result in a re-arranging of the tree. + */ + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2, + 0, &max_extent_size); + if (offset != (BITS_PER_BITMAP * sectorsize)) { + test_err("error, found %llu instead of %llu for our alloc", + offset, + (unsigned long long)(BITS_PER_BITMAP * sectorsize)); + return -EINVAL; + } + + cache->free_space_ctl->op = orig_free_space_ops; + btrfs_remove_free_space_cache(cache); + return 0; +} + +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_block_group *cache; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + test_msg("running btrfs free space cache tests"); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + /* + * For ppc64 (with 64k page size), bytes per bitmap might be + * larger than 1G. To make bitmap test available in ppc64, + * alloc dummy block group whose size cross bitmaps. + */ + cache = btrfs_alloc_dummy_block_group(fs_info, + BITS_PER_BITMAP * sectorsize + PAGE_SIZE); + if (!cache) { + test_std_err(TEST_ALLOC_BLOCK_GROUP); + btrfs_free_dummy_fs_info(fs_info); + return 0; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); + + ret = test_extents(cache); + if (ret) + goto out; + ret = test_bitmaps(cache, sectorsize); + if (ret) + goto out; + ret = test_bitmaps_and_extents(cache, sectorsize); + if (ret) + goto out; + + ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); + if (ret) + goto out; + ret = test_bytes_index(cache, sectorsize); +out: + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c new file mode 100644 index 0000000000..b61972046f --- /dev/null +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -0,0 +1,589 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Facebook. All rights reserved. 
+ */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../disk-io.h" +#include "../free-space-tree.h" +#include "../transaction.h" +#include "../block-group.h" +#include "../accessors.h" + +struct free_space_extent { + u64 start; + u64 length; +}; + +static int __check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + const struct free_space_extent * const extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + struct btrfs_key key; + int prev_bit = 0, bit; + u64 extent_start = 0, offset, end; + u32 flags, extent_count; + unsigned int i; + int ret; + + info = search_free_space_info(trans, cache, path, 0); + if (IS_ERR(info)) { + test_err("could not find free space info"); + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + if (extent_count != num_extents) { + test_err("extent count is wrong"); + ret = -EINVAL; + goto out; + } + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (path->slots[0] != 0) + goto invalid; + end = cache->start + cache->length; + i = 0; + while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY) + goto invalid; + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(cache, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + if (i >= num_extents || + extent_start != extents[i].start || + offset - extent_start != extents[i].length) + goto invalid; + i++; + } + prev_bit = bit; + offset += fs_info->sectorsize; + } + } + if (prev_bit == 1) { + if (i >= num_extents || + extent_start != extents[i].start || + end - extent_start != extents[i].length) + goto invalid; + i++; + } + if (i != num_extents) + goto invalid; + } else { + if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 || + path->slots[0] != 0) + goto invalid; + for (i = 0; i < num_extents; i++) { + path->slots[0]++; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY || + key.objectid != extents[i].start || + key.offset != extents[i].length) + goto invalid; + } + } + + ret = 0; +out: + btrfs_release_path(path); + return ret; +invalid: + test_err("free space tree is invalid"); + ret = -EINVAL; + goto out; +} + +static int check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + const struct free_space_extent * const extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + info = search_free_space_info(trans, cache, path, 0); + if (IS_ERR(info)) { + test_err("could not find free space info"); + btrfs_release_path(path); + return PTR_ERR(info); + } + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + ret = __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); + if (ret) + return ret; + + /* Flip it to the other format and check that for good measure. 
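+ *
+ * Whichever representation the block group currently uses, converting it
+ * to the other one must leave the same set of free ranges, so the same
+ * extents array is checked again after the conversion.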
*/ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + ret = convert_free_space_to_extents(trans, cache, path); + if (ret) { + test_err("could not convert to extents"); + return ret; + } + } else { + ret = convert_free_space_to_bitmaps(trans, cache, path); + if (ret) { + test_err("could not convert to bitmaps"); + return ret; + } + } + return __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); +} + +static int test_empty_block_group(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start, cache->length}, + }; + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_all(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = {}; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start, + cache->length); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_beginning(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start + alignment, cache->length - alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start, alignment); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); + +} + +static int test_remove_end(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start, cache->length - alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start + cache->length - alignment, + alignment); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_middle(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start, alignment}, + {cache->start + 2 * alignment, cache->length - 2 * alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start + alignment, + alignment); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_left(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start, 2 * alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + ret = 
__add_to_free_space_tree(trans, cache, path, cache->start, + alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + alignment, + alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_right(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start + alignment, 2 * alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, + alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + alignment, + alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_both(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start, 3 * alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, cache->start, + alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_none(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group *cache, + struct btrfs_path *path, + u32 alignment) +{ + const struct free_space_extent extents[] = { + {cache->start, alignment}, + {cache->start + 2 * alignment, alignment}, + {cache->start + 4 * alignment, alignment}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); + if (ret) { + test_err("could not remove free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, cache->start, + alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + 4 * alignment, alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + ret = __add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); + if (ret) { + test_err("could not add free space"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +typedef int (*test_func_t)(struct btrfs_trans_handle *, + struct btrfs_fs_info *, + 
struct btrfs_block_group *, + struct btrfs_path *, + u32 alignment); + +static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, + u32 nodesize, u32 alignment) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + struct btrfs_block_group *cache = NULL; + struct btrfs_trans_handle trans; + struct btrfs_path *path = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); + root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment); + if (!cache) { + test_std_err(TEST_ALLOC_BLOCK_GROUP); + ret = -ENOMEM; + goto out; + } + cache->bitmap_low_thresh = 0; + cache->bitmap_high_thresh = (u32)-1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); + cache->fs_info = root->fs_info; + + btrfs_init_dummy_trans(&trans, root->fs_info); + + path = btrfs_alloc_path(); + if (!path) { + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; + } + + ret = add_block_group_free_space(&trans, cache); + if (ret) { + test_err("could not add block group free space"); + goto out; + } + + if (bitmaps) { + ret = convert_free_space_to_bitmaps(&trans, cache, path); + if (ret) { + test_err("could not convert block group to bitmaps"); + goto out; + } + } + + ret = test_func(&trans, root->fs_info, cache, path, alignment); + if (ret) + goto out; + + ret = remove_block_group_free_space(&trans, cache); + if (ret) { + test_err("could not remove block group free space"); + goto out; + } + + if (btrfs_header_nritems(root->node) != 0) { + test_err("free space tree has leftover items"); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + btrfs_free_path(path); + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +static int run_test_both_formats(test_func_t test_func, u32 sectorsize, + u32 nodesize, u32 alignment) +{ + int test_ret = 0; + int ret; + + ret = run_test(test_func, 0, sectorsize, nodesize, alignment); + if (ret) { + test_err( + "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + ret = run_test(test_func, 1, sectorsize, nodesize, alignment); + if (ret) { + test_err( + "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + return test_ret; +} + +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) +{ + test_func_t tests[] = { + test_empty_block_group, + test_remove_all, + test_remove_beginning, + test_remove_end, + test_remove_middle, + test_merge_left, + test_merge_right, + test_merge_both, + test_merge_none, + }; + u32 bitmap_alignment; + int test_ret = 0; 
+ int i; + + /* + * Align some operations to a page to flush out bugs in the extent + * buffer bitmap handling of highmem. + */ + bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE; + + test_msg("running free space tree tests"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + int ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + sectorsize); + if (ret) + test_ret = ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + bitmap_alignment); + if (ret) + test_ret = ret; + } + + return test_ret; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c new file mode 100644 index 0000000000..492d69d2fa --- /dev/null +++ b/fs/btrfs/tests/inode-tests.c @@ -0,0 +1,1108 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../btrfs_inode.h" +#include "../disk-io.h" +#include "../extent_io.h" +#include "../volumes.h" +#include "../compression.h" +#include "../accessors.h" + +static void insert_extent(struct btrfs_root *root, u64 start, u64 len, + u64 ram_bytes, u64 offset, u64 disk_bytenr, + u64 disk_len, u32 type, u8 compression, int slot) +{ + struct btrfs_path path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = sizeof(struct btrfs_file_extent_item); + + if (type == BTRFS_FILE_EXTENT_INLINE) + value_len += len; + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = slot; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, 1); + btrfs_set_file_extent_type(leaf, fi, type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len); + btrfs_set_file_extent_offset(leaf, fi, offset); + btrfs_set_file_extent_num_bytes(leaf, fi, len); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); +} + +static void insert_inode_item_key(struct btrfs_root *root) +{ + struct btrfs_path path; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = 0; + + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = 0; + + key.objectid = BTRFS_INODE_ITEM_KEY; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); +} + +/* + * Build the most complicated map of extents the earth has ever seen. We want + * this so we can test all of the corner cases of btrfs_get_extent. 
Here is a + * diagram of how the extents will look though this may not be possible we still + * want to make sure everything acts normally (the last number is not inclusive) + * + * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [inline][hole but no extent][ hole ][ regular ][regular1 split] + * + * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] + * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] + * + * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] + * + * [69635-73731][ 73731 - 86019 ][86019-90115] + * [ regular ][ hole but no extent][ regular ] + */ +static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) +{ + int slot = 0; + u64 disk_bytenr = SZ_1M; + u64 offset = 0; + + /* + * Tree-checker has strict limits on inline extents that they can only + * exist at file offset 0, thus we can only have one inline file extent + * at most. + */ + insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + slot); + slot++; + offset = sectorsize; + + /* Now another hole */ + insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 4; + + /* Now for a regular extent */ + insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + disk_bytenr += sectorsize; + offset += sectorsize - 1; + + /* + * Now for 3 extents that were split from a hole punch so we test + * offsets properly. + */ + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; + + /* Now for a unwritten prealloc extent */ + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += sectorsize; + + /* + * We want to jack up disk_bytenr a little more so the em stuff doesn't + * merge our records. + */ + disk_bytenr += 2 * sectorsize; + + /* + * Now for a partially written prealloc extent, basically the same as + * the hole punch example above. Ram_bytes never changes when you mark + * extents written btw. 
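+ *
+ * The three items below all point at the same 4 * sectorsize allocation:
+ * the first and last pieces stay prealloc while the middle sectorsize is
+ * marked as a written (regular) extent, with the extent offsets advancing
+ * accordingly.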
+ */ + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += sectorsize; + insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, + disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; + + /* Now a normal compressed extent */ + insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 2 * sectorsize; + /* No merges */ + disk_bytenr += 2 * sectorsize; + + /* Now a split compressed extent */ + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr + sectorsize, sectorsize, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, sectorsize, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 2 * sectorsize; + disk_bytenr += 2 * sectorsize; + + /* Now extents that have a hole but no hole extent */ + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4 * sectorsize; + disk_bytenr += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); +} + +static unsigned long prealloc_only = 0; +static unsigned long compressed_only = 0; +static unsigned long vacancy_only = 0; + +static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info = NULL; + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + u64 orig_start; + u64 disk_bytenr; + u64 offset; + int ret = -ENOMEM; + + test_msg("running btrfs_get_extent tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return ret; + } + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + goto out; + } + + root->node = alloc_dummy_extent_buffer(fs_info, nodesize); + if (!root->node) { + test_std_err(TEST_ALLOC_ROOT); + goto out; + } + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + ret = -EINVAL; + + /* First with no extents */ + BTRFS_I(inode)->root = root; + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + if (IS_ERR(em)) { + em = NULL; + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->block_start); + goto out; + } + free_extent_map(em); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + + /* + * All of the magic numbers are based on the mapping setup in + * setup_file_extents, so if you change anything there you need to + * update the comment and update the expected values below. 
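+ *
+ * Each btrfs_get_extent() call below walks the next piece of that layout
+ * in file offset order and checks em->start, em->len, em->block_start,
+ * em->orig_start and em->flags against what setup_file_extents() inserted.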
+ */ + setup_file_extents(root, sectorsize); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != EXTENT_MAP_INLINE) { + test_err("expected an inline, got %llu", em->block_start); + goto out; + } + + /* + * For inline extent, we always round up the em to sectorsize, as + * they are either: + * + * a) a hidden hole + * The range will be zeroed at inline extent read time. + * + * b) a file extent with unaligned bytenr + * Tree checker will reject it. + */ + if (em->start != 0 || em->len != sectorsize) { + test_err( + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + /* + * We don't test anything else for inline since it doesn't get set + * unless we have a page for it to write into. Maybe we should change + * this? + */ + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4) { + test_err( + "unexpected extent wanted start %llu len 4, got start %llu len %llu", + offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Regular extent */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize - 1) { + test_err( + "unexpected extent wanted start %llu len 4095, got start %llu len %llu", + offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are split extents */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != 
EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != 2 * sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, 2 * sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_err("wrong orig offset, want %llu, have %llu", + orig_start, em->orig_start); + goto out; + } + disk_bytenr += (em->start - orig_start); + if (em->block_start != disk_bytenr) { + test_err("wrong block start, want %llu, have %llu", + disk_bytenr, em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Prealloc extent */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_err("unexpected flags set, want %lu have %lu", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are a half written prealloc extent */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_err("unexpected flags set, want %lu have %lu", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_HOLE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + 
"unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_err("unexpected orig offset, wanted %llu, have %llu", + orig_start, em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_err("unexpected block start, wanted %llu, have %llu", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != 2 * sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, 2 * sectorsize, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_err("unexpected flags set, want %lu have %lu", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_err("wrong orig offset, want %llu, have %llu", orig_start, + em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_err("unexpected block start, wanted %llu, have %llu", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Now for the compressed extent */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != 2 * sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, 2 * sectorsize, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_err("unexpected flags set, want %lu have %lu", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_err("unexpected compress type, wanted %d, got %d", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Split compressed extent */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_err("unexpected flags set, want %lu have %lu", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != 
BTRFS_COMPRESS_ZLIB) { + test_err("unexpected compress type, wanted %d, got %d", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != disk_bytenr) { + test_err("block start does not match, want %llu got %llu", + disk_bytenr, em->block_start); + goto out; + } + if (em->start != offset || em->len != 2 * sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, 2 * sectorsize, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_err("unexpected flags set, want %lu have %lu", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_err("wrong orig offset, want %llu, have %llu", + em->start, orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_err("unexpected compress type, wanted %d, got %d", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* A hole between regular extents but no hole extent */ + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_err("expected a hole extent, got %llu", em->block_start); + goto out; + } + /* + * Currently we just return a length that we requested rather than the + * length of the actual hole, if this changes we'll have to change this + * test. 
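+ *
+ * The lookup below asks for SZ_4M, while the check expects the
+ * 3 * sectorsize gap that setup_file_extents() actually left between the
+ * last two regular extents.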
+ */ + if (em->start != offset || em->len != 3 * sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, 3 * sectorsize, em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_err("unexpected flags set, want %lu have %lu", + vacancy_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != offset || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, want 0 have %lu", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_err("wrong orig offset, want %llu, have %llu", em->start, + em->orig_start); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +static int test_hole_first(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info = NULL; + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + int ret = -ENOMEM; + + test_msg("running hole first btrfs_get_extent test"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return ret; + } + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + goto out; + } + + root->node = alloc_dummy_extent_buffer(fs_info, nodesize); + if (!root->node) { + test_std_err(TEST_ALLOC_ROOT); + goto out; + } + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + BTRFS_I(inode)->root = root; + ret = -EINVAL; + + /* + * Need a blank inode item here just so we don't confuse + * btrfs_get_extent. 
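+ *
+ * insert_inode_item_key() only adds an empty inode item, and the single
+ * regular extent is inserted at file offset sectorsize, so the lookup at
+ * offset 0 below should come back as an implicit hole.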
+ */ + insert_inode_item_key(root); + insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->block_start); + goto out; + } + if (em->start != 0 || em->len != sectorsize) { + test_err( + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_err("wrong flags, wanted %lu, have %lu", vacancy_only, + em->flags); + goto out; + } + free_extent_map(em); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + if (IS_ERR(em)) { + test_err("got an error when we shouldn't have"); + goto out; + } + if (em->block_start != sectorsize) { + test_err("expected a real extent, got %llu", em->block_start); + goto out; + } + if (em->start != sectorsize || em->len != sectorsize) { + test_err( + "unexpected extent wanted start %u len %u, got start %llu len %llu", + sectorsize, sectorsize, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_err("unexpected flags set, wanted 0 got %lu", + em->flags); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +static int test_extent_accounting(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info = NULL; + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + test_msg("running outstanding_extents tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return ret; + } + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + goto out; + } + + BTRFS_I(inode)->root = root; + + /* [BTRFS_MAX_EXTENT_SIZE] */ + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, + BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL); + if (ret) { + test_err("btrfs_set_extent_delalloc returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_err("miscount, wanted 1, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, + 0, NULL); + if (ret) { + test_err("btrfs_set_extent_delalloc returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_err("miscount, wanted 2, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, NULL); + if (ret) { + test_err("clear_extent_bit returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_err("miscount, wanted 2, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 
BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + + sectorsize - 1, + 0, NULL); + if (ret) { + test_err("btrfs_set_extent_delalloc returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_err("miscount, wanted 2, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, + (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, + 0, NULL); + if (ret) { + test_err("btrfs_set_extent_delalloc returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_err("miscount, wanted 4, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); + if (ret) { + test_err("btrfs_set_extent_delalloc returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_err("miscount, wanted 3, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, NULL); + if (ret) { + test_err("clear_extent_bit returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_err("miscount, wanted 4, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. 
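+ *
+ * Re-adding delalloc over the sectorsize hole merges the two large ranges
+ * back into one contiguous delalloc range, so the outstanding extent
+ * count is expected to drop from 4 back to 3.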
+ */ + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); + if (ret) { + test_err("btrfs_set_extent_delalloc returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_err("miscount, wanted 3, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, NULL); + if (ret) { + test_err("clear_extent_bit returned %d", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_err("miscount, wanted 0, got %u", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_UPTODATE, NULL); + iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +int btrfs_test_inodes(u32 sectorsize, u32 nodesize) +{ + int ret; + + test_msg("running inode tests"); + + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); + set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + + ret = test_btrfs_get_extent(sectorsize, nodesize); + if (ret) + return ret; + ret = test_hole_first(sectorsize, nodesize); + if (ret) + return ret; + return test_extent_accounting(sectorsize, nodesize); +} diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 0000000000..3fc8dc3fd9 --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Facebook. All rights reserved. + */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" +#include "../backref.h" +#include "../fs.h" +#include "../accessors.h" + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + btrfs_init_dummy_trans(&trans, NULL); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_std_err(TEST_ALLOC_ROOT); + return -ENOMEM; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_err("couldn't insert ref %d", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 0); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); 
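Editorial note on the test above: the expected counts in test_extent_accounting() follow from a single rule — each contiguous delalloc range costs ceil(len / BTRFS_MAX_EXTENT_SIZE) outstanding extents, and punching a sectorsize hole splits one range into two. The following standalone sketch is an illustration only (not btrfs code) and assumes the commonly used BTRFS_MAX_EXTENT_SIZE of 128 MiB; it reproduces the first few values the test asserts.

#include <stdio.h>
#include <stdint.h>

/* Assumed value; in the kernel BTRFS_MAX_EXTENT_SIZE is 128 MiB. */
#define MAX_EXTENT_SIZE (128ULL * 1024 * 1024)

/* One contiguous delalloc range of 'len' bytes costs this many outstanding extents. */
static uint64_t extents_for_range(uint64_t len)
{
	return (len + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

int main(void)
{
	const uint64_t sectorsize = 4096;

	/* [0, MAX_EXTENT_SIZE) is exactly one extent. */
	printf("%llu\n", (unsigned long long)extents_for_range(MAX_EXTENT_SIZE));
	/* Growing it by one sector crosses the limit: two extents. */
	printf("%llu\n", (unsigned long long)extents_for_range(MAX_EXTENT_SIZE + sectorsize));
	/* A sectorsize hole in the middle leaves two halves, still two extents total. */
	printf("%llu\n", (unsigned long long)(extents_for_range(MAX_EXTENT_SIZE / 2) +
					      extents_for_range(MAX_EXTENT_SIZE / 2)));
	return 0;
}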
+ return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + btrfs_init_dummy_trans(&trans, NULL); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_std_err(TEST_ALLOC_ROOT); + return -ENOMEM; + } + + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_err("couldn't find extent ref"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_err("failed to insert backref"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + btrfs_init_dummy_trans(&trans, NULL); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_std_err(TEST_ALLOC_ROOT); + return -ENOMEM; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_err("didn't find our key %d", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + btrfs_init_dummy_trans(&trans, NULL); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_std_err(TEST_ALLOC_ROOT); + return -ENOMEM; + } + + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_err("couldn't find extent ref"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_err("couldn't find backref %d", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) +{ + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; + int ret; + + btrfs_init_dummy_trans(&trans, fs_info); + + 
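Editorial note: insert_normal_tree_ref(), add_tree_ref() and remove_extent_ref() above keep two pieces of state in sync by hand — the refs field of the extent item, and a separate backref item keyed by either the parent bytenr (shared block ref) or the owning root (tree block ref). A toy userspace model of that bookkeeping, with invented names and no btrfs API, might look like this:

#include <stdio.h>
#include <stdint.h>

/* Invented structures, for illustration only. */
struct fake_backref { int shared; uint64_t key; };	/* parent bytenr or root id */
struct fake_extent_item {
	uint64_t bytenr, num_bytes, refs;
	struct fake_backref backrefs[8];
};

static void add_ref(struct fake_extent_item *ei, uint64_t parent, uint64_t root)
{
	/* Mirror the two steps above: bump refs, then insert the backref item. */
	ei->backrefs[ei->refs].shared = parent != 0;
	ei->backrefs[ei->refs].key = parent ? parent : root;
	ei->refs++;
}

static void remove_ref(struct fake_extent_item *ei, uint64_t parent, uint64_t root)
{
	uint64_t key = parent ? parent : root;

	for (uint64_t i = 0; i < ei->refs; i++) {
		if (ei->backrefs[i].key != key)
			continue;
		ei->backrefs[i] = ei->backrefs[ei->refs - 1];
		ei->refs--;	/* drop the count and the matching backref together */
		break;
	}
}

int main(void)
{
	struct fake_extent_item ei = { .bytenr = 16384, .num_bytes = 16384 };

	add_ref(&ei, 0, 5);	/* FS tree ref, like insert_normal_tree_ref() */
	add_ref(&ei, 0, 256);	/* second root, like add_tree_ref() */
	remove_ref(&ei, 0, 256);
	printf("refs = %llu\n", (unsigned long long)ei.refs);	/* 1 */
	return 0;
}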
test_msg("running qgroup add/remove tests"); + ret = btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID); + if (ret) { + test_err("couldn't create a qgroup %d", ret); + return ret; + } + + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + /* + * Since the test trans doesn't have the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + test_err("couldn't find old roots: %d", ret); + return ret; + } + old_roots = ctx.roots; + ctx.roots = NULL; + + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); + if (ret) { + ulist_free(old_roots); + return ret; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); + return ret; + } + new_roots = ctx.roots; + ctx.roots = NULL; + + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); + if (ret) { + test_err("couldn't account space for a qgroup %d", ret); + return ret; + } + + /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */ + old_roots = NULL; + new_roots = NULL; + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + test_err("couldn't find old roots: %d", ret); + return ret; + } + old_roots = ctx.roots; + ctx.roots = NULL; + + ret = remove_extent_item(root, nodesize, nodesize); + if (ret) { + ulist_free(old_roots); + return -EINVAL; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); + return ret; + } + new_roots = ctx.roots; + ctx.roots = NULL; + + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); + if (ret) { + test_err("couldn't account space for a qgroup %d", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) +{ + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; + int ret; + + btrfs_init_dummy_trans(&trans, fs_info); + + test_msg("running qgroup multiple refs test"); + + /* + * We have BTRFS_FS_TREE_OBJECTID created already from the + * previous test. 
+ */ + ret = btrfs_create_qgroup(&trans, BTRFS_FIRST_FREE_OBJECTID); + if (ret) { + test_err("couldn't create a qgroup %d", ret); + return ret; + } + + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + test_err("couldn't find old roots: %d", ret); + return ret; + } + old_roots = ctx.roots; + ctx.roots = NULL; + + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); + if (ret) { + ulist_free(old_roots); + return ret; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); + return ret; + } + new_roots = ctx.roots; + ctx.roots = NULL; + + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); + if (ret) { + test_err("couldn't account space for a qgroup %d", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + test_err("couldn't find old roots: %d", ret); + return ret; + } + old_roots = ctx.roots; + ctx.roots = NULL; + + ret = add_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); + if (ret) { + ulist_free(old_roots); + return ret; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); + return ret; + } + new_roots = ctx.roots; + ctx.roots = NULL; + + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); + if (ret) { + test_err("couldn't account space for a qgroup %d", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, 0)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + nodesize, 0)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + test_err("couldn't find old roots: %d", ret); + return ret; + } + old_roots = ctx.roots; + ctx.roots = NULL; + + ret = remove_extent_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); + if (ret) { + ulist_free(old_roots); + return ret; + } + + ret = btrfs_find_all_roots(&ctx, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); + return ret; + } + new_roots = ctx.roots; + ctx.roots = NULL; + + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); + if (ret) { + test_err("couldn't account space for a qgroup %d", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + 0, 0)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { + test_err("qgroup counts didn't match expected values"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + 
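Editorial note: the values checked by btrfs_verify_qgroup_counts() in both tests above follow from one rule — every root that can reach an extent adds its size to that qgroup's referenced count, and the extent also counts as exclusive only when exactly one root can reach it. A standalone sketch of that rule (illustrative names, not the kernel's accounting code):

#include <stdio.h>
#include <stdint.h>

struct fake_extent { uint64_t bytes; int nr_roots; uint64_t roots[4]; };

/* Accumulate one extent into the (referenced, exclusive) counters of 'root'. */
static void account(const struct fake_extent *e, uint64_t root,
		    uint64_t *rfer, uint64_t *excl)
{
	for (int i = 0; i < e->nr_roots; i++) {
		if (e->roots[i] != root)
			continue;
		*rfer += e->bytes;
		if (e->nr_roots == 1)
			*excl += e->bytes;
	}
}

int main(void)
{
	const uint64_t nodesize = 16384;
	/* One tree block shared by the FS tree (5) and a subvolume (256),
	 * as in test_multiple_refs() after add_tree_ref(). */
	struct fake_extent shared = { nodesize, 2, { 5, 256 } };
	uint64_t rfer = 0, excl = 0;

	account(&shared, 5, &rfer, &excl);
	printf("root 5: rfer=%llu excl=%llu\n",	/* 16384 and 0 */
	       (unsigned long long)rfer, (unsigned long long)excl);
	return 0;
}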
ret = PTR_ERR(root); + goto out; + } + + /* We are using this root as our extent root */ + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_err("couldn't allocate dummy buffer"); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + tmp_root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(tmp_root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_err("couldn't insert fs root %d", ret); + goto out; + } + btrfs_put_root(tmp_root); + + tmp_root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(tmp_root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_err("couldn't insert fs root %d", ret); + goto out; + } + btrfs_put_root(tmp_root); + + test_msg("running qgroup tests"); + ret = test_no_shared_qgroup(root, sectorsize, nodesize); + if (ret) + goto out; + ret = test_multiple_refs(root, sectorsize, nodesize); +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 0000000000..0ac2d191cd --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,2682 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "tree-log.h" +#include "volumes.h" +#include "dev-replace.h" +#include "qgroup.h" +#include "block-group.h" +#include "space-info.h" +#include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "relocation.h" +#include "scrub.h" + +static struct kmem_cache *btrfs_trans_handle_cachep; + +/* + * Transaction states and transitions + * + * No running transaction (fs tree blocks are not modified) + * | + * | To next stage: + * | Call start_transaction() variants. Except btrfs_join_transaction_nostart(). + * V + * Transaction N [[TRANS_STATE_RUNNING]] + * | + * | New trans handles can be attached to transaction N by calling all + * | start_transaction() variants. 
+ * | + * | To next stage: + * | Call btrfs_commit_transaction() on any trans handle attached to + * | transaction N + * V + * Transaction N [[TRANS_STATE_COMMIT_PREP]] + * | + * | If there are simultaneous calls to btrfs_commit_transaction() one will win + * | the race and the rest will wait for the winner to commit the transaction. + * | + * | The winner will wait for previous running transaction to completely finish + * | if there is one. + * | + * Transaction N [[TRANS_STATE_COMMIT_START]] + * | + * | Then one of the following happens: + * | - Wait for all other trans handle holders to release. + * | The btrfs_commit_transaction() caller will do the commit work. + * | - Wait for current transaction to be committed by others. + * | Other btrfs_commit_transaction() caller will do the commit work. + * | + * | At this stage, only btrfs_join_transaction*() variants can attach + * | to this running transaction. + * | All other variants will wait for current one to finish and attach to + * | transaction N+1. + * | + * | To next stage: + * | Caller is chosen to commit transaction N, and all other trans handle + * | haven been released. + * V + * Transaction N [[TRANS_STATE_COMMIT_DOING]] + * | + * | The heavy lifting transaction work is started. + * | From running delayed refs (modifying extent tree) to creating pending + * | snapshots, running qgroups. + * | In short, modify supporting trees to reflect modifications of subvolume + * | trees. + * | + * | At this stage, all start_transaction() calls will wait for this + * | transaction to finish and attach to transaction N+1. + * | + * | To next stage: + * | Until all supporting trees are updated. + * V + * Transaction N [[TRANS_STATE_UNBLOCKED]] + * | Transaction N+1 + * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]] + * | need to write them back to disk and update | + * | super blocks. | + * | | + * | At this stage, new transaction is allowed to | + * | start. | + * | All new start_transaction() calls will be | + * | attached to transid N+1. | + * | | + * | To next stage: | + * | Until all tree blocks are super blocks are | + * | written to block devices | + * V | + * Transaction N [[TRANS_STATE_COMPLETED]] V + * All tree blocks and super blocks are written. Transaction N+1 + * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]] + * data structures will be cleaned up. 
| Life goes on + */ +static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { + [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_COMMIT_PREP] = 0U, + [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOSTART), + [TRANS_STATE_UNBLOCKED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), + [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), + [TRANS_STATE_COMPLETED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), +}; + +void btrfs_put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(refcount_read(&transaction->use_count) == 0); + if (refcount_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.dirty_extent_root)); + if (transaction->delayed_refs.pending_csums) + btrfs_err(transaction->fs_info, + "pending csums is %llu", + transaction->delayed_refs.pending_csums); + /* + * If any block groups are found in ->deleted_bgs then it's + * because the transaction was aborted and a commit did not + * happen (things failed before writing the new superblock + * and calling btrfs_finish_extent_commit()), so we can not + * discard the physical locations of the block groups. + */ + while (!list_empty(&transaction->deleted_bgs)) { + struct btrfs_block_group *cache; + + cache = list_first_entry(&transaction->deleted_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&cache->bg_list); + btrfs_unfreeze_block_group(cache); + btrfs_put_block_group(cache); + } + WARN_ON(!list_empty(&transaction->dev_update_list)); + kfree(transaction); + } +} + +static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root, *tmp; + + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + + down_write(&fs_info->commit_root_sem); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + fs_info->last_reloc_trans = trans->transid; + + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + extent_io_tree_release(&root->dirty_log_pages); + btrfs_qgroup_clean_swapped_blocks(root); + } + + /* We can free old roots now. 
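Editorial note: btrfs_blocked_trans_types[] above is consulted when a handle tries to attach to the running transaction (join_transaction(), further below): once the transaction has reached a given state, any start type whose bit is set in that state's mask is refused with -EBUSY and has to wait for the next transaction. A reduced model of that gate, using only a subset of the states and types for illustration:

#include <stdio.h>

enum state { RUNNING, COMMIT_START, COMMIT_DOING, UNBLOCKED, COMPLETED, STATE_MAX };

#define T_START		(1u << 0)
#define T_ATTACH	(1u << 1)
#define T_JOIN		(1u << 2)
#define T_JOIN_NOLOCK	(1u << 3)

/* Which join types are blocked once the running transaction reaches a state. */
static const unsigned int blocked[STATE_MAX] = {
	[RUNNING]      = 0,
	[COMMIT_START] = T_START | T_ATTACH,
	[COMMIT_DOING] = T_START | T_ATTACH | T_JOIN,
	[UNBLOCKED]    = T_START | T_ATTACH | T_JOIN | T_JOIN_NOLOCK,
	[COMPLETED]    = T_START | T_ATTACH | T_JOIN | T_JOIN_NOLOCK,
};

static int can_join(enum state s, unsigned int type)
{
	return !(blocked[s] & type);	/* mirrors the -EBUSY check in join_transaction() */
}

int main(void)
{
	printf("TRANS_START while COMMIT_DOING: %s\n",
	       can_join(COMMIT_DOING, T_START) ? "allowed" : "refused (-EBUSY)");
	printf("TRANS_JOIN while COMMIT_START: %s\n",
	       can_join(COMMIT_START, T_JOIN) ? "allowed" : "refused (-EBUSY)");
	return 0;
}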
*/ + spin_lock(&cur_trans->dropped_roots_lock); + while (!list_empty(&cur_trans->dropped_roots)) { + root = list_first_entry(&cur_trans->dropped_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + spin_unlock(&cur_trans->dropped_roots_lock); + btrfs_free_log(trans, root); + btrfs_drop_and_free_fs_root(fs_info, root); + spin_lock(&cur_trans->dropped_roots_lock); + } + spin_unlock(&cur_trans->dropped_roots_lock); + + up_write(&fs_info->commit_root_sem); +} + +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_inc(&trans->num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_dec(&trans->num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(&trans->num_extwriters); +} + +/* + * To be called after doing the chunk btree updates right after allocating a new + * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a + * chunk after all chunk btree updates and after finishing the second phase of + * chunk allocation (btrfs_create_pending_block_groups()) in case some block + * group had its chunk item insertion delayed to the second phase. + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved, NULL); + trans->chunk_bytes_reserved = 0; +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_fs_info *fs_info, + unsigned int type) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&fs_info->trans_lock); +loop: + /* The file system has been taken offline. No new transactions. */ + if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); + return -EROFS; + } + + cur_trans = fs_info->running_transaction; + if (cur_trans) { + if (TRANS_ABORTED(cur_trans)) { + spin_unlock(&fs_info->trans_lock); + return cur_trans->aborted; + } + if (btrfs_blocked_trans_types[cur_trans->state] & type) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } + refcount_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + extwriter_counter_inc(cur_trans, type); + spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); + return 0; + } + spin_unlock(&fs_info->trans_lock); + + /* + * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the + * current transaction, and commit it. If there is no transaction, just + * return ENOENT. 
+ */ + if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) + return -ENOENT; + + /* + * JOIN_NOLOCK only happens during the transaction commit, so + * it is impossible that ->running_transaction is NULL + */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); + + spin_lock(&fs_info->trans_lock); + if (fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the checks above + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + kfree(cur_trans); + goto loop; + } else if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + kfree(cur_trans); + return -EROFS; + } + + cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); + atomic_set(&cur_trans->num_writers, 1); + extwriter_counter_init(cur_trans, type); + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->state = TRANS_STATE_RUNNING; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + refcount_set(&cur_trans->use_count, 2); + cur_trans->flags = 0; + cur_trans->start_time = ktime_get_seconds(); + + memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); + + cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); + + /* + * although the tree mod log is per file system and not per transaction, + * the log must never go across transaction boundaries. + */ + smp_mb(); + if (!list_empty(&fs_info->tree_mod_seq_list)) + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n"); + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n"); + atomic64_set(&fs_info->tree_mod_seq, 0); + + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->dev_update_list); + INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + INIT_LIST_HEAD(&cur_trans->io_bgs); + INIT_LIST_HEAD(&cur_trans->dropped_roots); + mutex_init(&cur_trans->cache_write_mutex); + spin_lock_init(&cur_trans->dirty_bgs_lock); + INIT_LIST_HEAD(&cur_trans->deleted_bgs); + spin_lock_init(&cur_trans->dropped_roots_lock); + list_add_tail(&cur_trans->list, &fs_info->trans_list); + extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES); + extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS); + fs_info->generation++; + cur_trans->transid = fs_info->generation; + fs_info->running_transaction = cur_trans; + cur_trans->aborted = 0; + spin_unlock(&fs_info->trans_lock); + + return 0; +} + +/* + * This does all the record keeping required to make sure that a shareable root + * is properly recorded in a given transaction. This is required to make sure + * the old root from before we joined the transaction is deleted when the + * transaction commits. 
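Editorial note: the refcount_set(&cur_trans->use_count, 2) above encodes the transaction's lifetime — one reference belongs to the handle that created it, one is dropped only by the commit path, so the structure outlives individual handles and is freed by btrfs_put_transaction() when the last reference goes away. A minimal userspace sketch of that convention (single-threaded, illustration only):

#include <stdio.h>
#include <stdlib.h>

struct fake_transaction { unsigned int use_count; };

static struct fake_transaction *start_fake_transaction(void)
{
	struct fake_transaction *t = calloc(1, sizeof(*t));

	if (t)
		t->use_count = 2;	/* one for the handle, one held until commit */
	return t;
}

static void put_fake_transaction(struct fake_transaction *t)
{
	if (--t->use_count == 0) {
		printf("last reference dropped, freeing\n");
		free(t);
	}
}

int main(void)
{
	struct fake_transaction *t = start_fake_transaction();

	if (!t)
		return 1;
	put_fake_transaction(t);	/* end_transaction drops the handle's ref */
	put_fake_transaction(t);	/* the commit path drops the final ref */
	return 0;
}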
+ */ +static int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int force) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + + if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + root->last_trans < trans->transid) || force) { + WARN_ON(!force && root->commit_root != root->node); + + /* + * see below for IN_TRANS_SETUP usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); + + /* make sure readers find IN_TRANS_SETUP before + * they find our root->last_trans update + */ + smp_wmb(); + + spin_lock(&fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid && !force) { + spin_unlock(&fs_info->fs_roots_radix_lock); + return 0; + } + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root IN_TRANS_SETUP. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ + ret = btrfs_init_reloc_root(trans, root); + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); + } + return ret; +} + + +void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; + + /* Add ourselves to the transaction dropped list */ + spin_lock(&cur_trans->dropped_roots_lock); + list_add_tail(&root->root_list, &cur_trans->dropped_roots); + spin_unlock(&cur_trans->dropped_roots_lock); + + /* Make sure we don't try to update the root at commit time */ + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); +} + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return 0; + + /* + * see record_root_in_trans for comments about IN_TRANS_SETUP usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) + return 0; + + mutex_lock(&fs_info->reloc_mutex); + ret = record_root_in_trans(trans, root, 0); + mutex_unlock(&fs_info->reloc_mutex); + + return ret; +} + +static inline int is_transaction_blocked(struct btrfs_transaction *trans) +{ + return (trans->state >= TRANS_STATE_COMMIT_START && + trans->state < TRANS_STATE_UNBLOCKED && + !TRANS_ABORTED(trans)); +} + +/* wait for commit against the current transaction to become unblocked + * when this 
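Editorial note: the smp_wmb()/smp_rmb() pairing described in the comments above is a publish pattern — the writer raises IN_TRANS_SETUP before making root->last_trans visible, so a reader that sees the new last_trans either also sees the flag (and takes the slow, locked path) or sees it already cleared once setup finished. Roughly the same idea in portable C11 atomics; this is a generic sketch of the ordering, not the kernel's primitives:

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative globals standing in for one root's state. */
static _Atomic int in_trans_setup;
static _Atomic unsigned long long last_trans;

static void record_root(unsigned long long transid)
{
	atomic_store_explicit(&in_trans_setup, 1, memory_order_relaxed);
	/* The release store plays the role of smp_wmb(): anyone who observes
	 * the new last_trans is guaranteed to also observe the flag. */
	atomic_store_explicit(&last_trans, transid, memory_order_release);
	/* ... expensive setup (reloc root creation) would run here ... */
	atomic_store_explicit(&in_trans_setup, 0, memory_order_release);
}

/* Fast-path check, like btrfs_record_root_in_trans(): skip the mutex only if
 * the root is already recorded for this transid and setup has finished. */
static int already_recorded(unsigned long long transid)
{
	if (atomic_load_explicit(&last_trans, memory_order_acquire) != transid)
		return 0;
	return !atomic_load_explicit(&in_trans_setup, memory_order_acquire);
}

int main(void)
{
	record_root(42);
	printf("fast path: %d\n", already_recorded(42));	/* 1 */
	return 0;
}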
is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&fs_info->trans_lock); + cur_trans = fs_info->running_transaction; + if (cur_trans && is_transaction_blocked(cur_trans)) { + refcount_inc(&cur_trans->use_count); + spin_unlock(&fs_info->trans_lock); + + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + wait_event(fs_info->transaction_wait, + cur_trans->state >= TRANS_STATE_UNBLOCKED || + TRANS_ABORTED(cur_trans)); + btrfs_put_transaction(cur_trans); + } else { + spin_unlock(&fs_info->trans_lock); + } +} + +static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) +{ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return 0; + + if (type == TRANS_START) + return 1; + + return 0; +} + +static inline bool need_reserve_reloc_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!fs_info->reloc_ctl || + !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + root->reloc_root) + return false; + + return true; +} + +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, unsigned int num_items, + unsigned int type, enum btrfs_reserve_flush_enum flush, + bool enforce_qgroups) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + u64 num_bytes = 0; + u64 qgroup_reserved = 0; + bool reloc_reserved = false; + bool do_chunk_alloc = false; + int ret; + + if (BTRFS_FS_ERROR(fs_info)) + return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type & TRANS_EXTWRITERS); + h = current->journal_info; + refcount_inc(&h->use_count); + WARN_ON(refcount_read(&h->use_count) > 2); + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items && root != fs_info->chunk_root) { + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; + u64 delayed_refs_bytes = 0; + + qgroup_reserved = num_items * fs_info->nodesize; + /* + * Use prealloc for now, as there might be a currently running + * transaction that could free this reserved space prematurely + * by committing. + */ + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, + enforce_qgroups, false); + if (ret) + return ERR_PTR(ret); + + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We + * accomplish this by simply assuming we'll do num_items worth + * of delayed refs updates in this trans handle, and refill that + * amount for whatever is missing in the reserve. 
+ */ + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); + if (flush == BTRFS_RESERVE_FLUSH_ALL && + !btrfs_block_rsv_full(delayed_refs_rsv)) { + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, + num_items); + num_bytes += delayed_refs_bytes; + } + + /* + * Do the reservation for the relocation root creation + */ + if (need_reserve_reloc_root(root)) { + num_bytes += fs_info->nodesize; + reloc_reserved = true; + } + + ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush); + if (ret) + goto reserve_fail; + if (delayed_refs_bytes) { + btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } + btrfs_block_rsv_add_bytes(rsv, num_bytes, true); + + if (rsv->space_info->force_alloc) + do_chunk_alloc = true; + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !btrfs_block_rsv_full(delayed_refs_rsv)) { + /* + * Some people call with btrfs_start_transaction(root, 0) + * because they can be throttled, but have some other mechanism + * for reserving space. We still want these guys to refill the + * delayed block_rsv so just add 1 items worth of reservation + * here. + */ + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); + if (ret) + goto reserve_fail; + } +again: + h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } + + /* + * If we are JOIN_NOLOCK we're already committing a transaction and + * waiting on this guy, so we don't need to do the sb_start_intwrite + * because we're already holding a ref. We need this because we could + * have raced in and did an fsync() on a file which can kick a commit + * and then we deadlock with somebody doing a freeze. + * + * If we are ATTACH, it means we just want to catch the current + * transaction and commit it, so we needn't do sb_start_intwrite(). + */ + if (type & __TRANS_FREEZABLE) + sb_start_intwrite(fs_info->sb); + + if (may_wait_transaction(fs_info, type)) + wait_current_trans(fs_info); + + do { + ret = join_transaction(fs_info, type); + if (ret == -EBUSY) { + wait_current_trans(fs_info); + if (unlikely(type == TRANS_ATTACH || + type == TRANS_JOIN_NOSTART)) + ret = -ENOENT; + } + } while (ret == -EBUSY); + + if (ret < 0) + goto join_fail; + + cur_trans = fs_info->running_transaction; + + h->transid = cur_trans->transid; + h->transaction = cur_trans; + refcount_set(&h->use_count, 1); + h->fs_info = root->fs_info; + + h->type = type; + INIT_LIST_HEAD(&h->new_bgs); + + smp_mb(); + if (cur_trans->state >= TRANS_STATE_COMMIT_START && + may_wait_transaction(fs_info, type)) { + current->journal_info = h; + btrfs_commit_transaction(h); + goto again; + } + + if (num_bytes) { + trace_btrfs_space_reservation(fs_info, "transaction", + h->transid, num_bytes, 1); + h->block_rsv = &fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; + h->reloc_reserved = reloc_reserved; + } + + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + +got_it: + if (!current->journal_info) + current->journal_info = h; + + /* + * If the space_info is marked ALLOC_FORCE then we'll get upgraded to + * ALLOC_FORCE the first run through, and then we won't allocate for + * anybody else who races in later. We don't care about the return + * value here. 
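Editorial note: the reservation above scales with num_items. In current kernels btrfs_calc_insert_metadata_size() works out to nodesize * BTRFS_MAX_LEVEL * 2 per item (room to COW a full path twice, to cover splits), and the qgroup prealloc is simply num_items * nodesize; the sketch below assumes those formulas and a 16 KiB nodesize, purely as a back-of-the-envelope illustration.

#include <stdio.h>
#include <stdint.h>

#define BTRFS_MAX_LEVEL 8	/* tree height limit, as in the kernel headers */

/* Assumed to match btrfs_calc_insert_metadata_size(): a full path of
 * BTRFS_MAX_LEVEL nodes, counted twice per item to allow for node splits. */
static uint64_t calc_insert_metadata_size(uint64_t nodesize, unsigned int num_items)
{
	return nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	const uint64_t nodesize = 16384;
	const unsigned int num_items = 5;

	printf("metadata reservation: %llu bytes\n",
	       (unsigned long long)calc_insert_metadata_size(nodesize, num_items));
	printf("qgroup prealloc:      %llu bytes\n",
	       (unsigned long long)(num_items * nodesize));
	return 0;
}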
+ */ + if (do_chunk_alloc && num_bytes) { + u64 flags = h->block_rsv->space_info->flags; + + btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), + CHUNK_ALLOC_NO_FORCE); + } + + /* + * btrfs_record_root_in_trans() needs to alloc new extents, and may + * call btrfs_join_transaction() while we're also starting a + * transaction. + * + * Thus it need to be called after current->journal_info initialized, + * or we can deadlock. + */ + ret = btrfs_record_root_in_trans(h, root); + if (ret) { + /* + * The transaction handle is fully initialized and linked with + * other structures so it needs to be ended in case of errors, + * not just freed. + */ + btrfs_end_transaction(h); + return ERR_PTR(ret); + } + + return h; + +join_fail: + if (type & __TRANS_FREEZABLE) + sb_end_intwrite(fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, + num_bytes, NULL); +reserve_fail: + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); + return ERR_PTR(ret); +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + unsigned int num_items) +{ + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL, true); +} + +struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( + struct btrfs_root *root, + unsigned int num_items) +{ + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL_STEAL, false); +} + +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH, + true); +} + +struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, + BTRFS_RESERVE_NO_FLUSH, true); +} + +/* + * Similar to regular join but it never starts a transaction when none is + * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. + * This is similar to btrfs_attach_transaction() but it allows the join to + * happen if the transaction commit already started but it's not yet in the + * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). + */ +struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_NOSTART, + BTRFS_RESERVE_NO_FLUSH, true); +} + +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH, true); +} + +/* + * btrfs_attach_transaction_barrier() - catch the running transaction + * + * It is similar to the above function, the difference is this one + * will wait for all the inactive transactions until they fully + * complete. 
+ */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH, true); + if (trans == ERR_PTR(-ENOENT)) { + int ret; + + ret = btrfs_wait_for_commit(root->fs_info, 0); + if (ret) + return ERR_PTR(ret); + } + + return trans; +} + +/* Wait for a transaction commit to reach at least the given state. */ +static noinline void wait_for_commit(struct btrfs_transaction *commit, + const enum btrfs_trans_state min_state) +{ + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + /* + * At the moment this function is called with min_state either being + * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED. + */ + if (min_state == TRANS_STATE_COMPLETED) + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + else + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } +} + +int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret = 0; + + if (transid) { + if (transid <= fs_info->last_trans_committed) + goto out; + + /* find specified transaction */ + spin_lock(&fs_info->trans_lock); + list_for_each_entry(t, &fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + refcount_inc(&cur_trans->use_count); + ret = 0; + break; + } + if (t->transid > transid) { + ret = 0; + break; + } + } + spin_unlock(&fs_info->trans_lock); + + /* + * The specified transaction doesn't exist, or we + * raced with btrfs_commit_transaction + */ + if (!cur_trans) { + if (transid > fs_info->last_trans_committed) + ret = -EINVAL; + goto out; + } + } else { + /* find newest transaction that is committing | committed */ + spin_lock(&fs_info->trans_lock); + list_for_each_entry_reverse(t, &fs_info->trans_list, + list) { + if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state == TRANS_STATE_COMPLETED) + break; + cur_trans = t; + refcount_inc(&cur_trans->use_count); + break; + } + } + spin_unlock(&fs_info->trans_lock); + if (!cur_trans) + goto out; /* nothing committing|committed */ + } + + wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); + ret = cur_trans->aborted; + btrfs_put_transaction(cur_trans); +out: + return ret; +} + +void btrfs_throttle(struct btrfs_fs_info *fs_info) +{ + wait_current_trans(fs_info); +} + +bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + if (cur_trans->state >= TRANS_STATE_COMMIT_START || + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) + return true; + + if (btrfs_check_space_for_delayed_refs(trans->fs_info)) + return true; + + return 
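Editorial note: wait_for_commit() above sleeps until the transaction's state reaches at least min_state, then (for TRANS_STATE_COMPLETED) walks back through older transactions that may still be finishing. The core wait is the classic "block until a monotonically increasing state crosses a threshold" shape; a self-contained pthread sketch of just that part (illustrative, build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int state;	/* only ever increases, like the transaction states */

static void set_state(int new_state)
{
	pthread_mutex_lock(&lock);
	state = new_state;
	pthread_cond_broadcast(&cond);	/* wake all waiters, like commit_wait */
	pthread_mutex_unlock(&lock);
}

static void wait_for_state(int min_state)
{
	pthread_mutex_lock(&lock);
	while (state < min_state)	/* wait_event(commit_wait, state >= min_state) */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void *committer(void *arg)
{
	(void)arg;
	set_state(1);	/* e.g. "super committed" */
	set_state(2);	/* e.g. "completed" */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, committer, NULL);
	wait_for_state(2);
	printf("transaction completed\n");
	pthread_join(t, NULL);
	return 0;
}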
!!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); +} + +static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) + +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); + return; + } + + if (!trans->bytes_reserved) + return; + + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, + trans->bytes_reserved, NULL); + trans->bytes_reserved = 0; +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + int throttle) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; + int err = 0; + + if (refcount_read(&trans->use_count) > 1) { + refcount_dec(&trans->use_count); + trans->block_rsv = trans->orig_rsv; + return 0; + } + + btrfs_trans_release_metadata(trans); + trans->block_rsv = NULL; + + btrfs_create_pending_block_groups(trans); + + btrfs_trans_release_chunk_metadata(trans); + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(info->sb); + + WARN_ON(cur_trans != info->running_transaction); + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); + extwriter_counter_dec(cur_trans, trans->type); + + cond_wake_up(&cur_trans->writer_wait); + + btrfs_lockdep_release(info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(info, btrfs_trans_num_writers); + + btrfs_put_transaction(cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + + if (throttle) + btrfs_run_delayed_iputs(info); + + if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { + wake_up_process(info->transaction_kthread); + if (TRANS_ABORTED(trans)) + err = trans->aborted; + else + err = -EROFS; + } + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return err; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans) +{ + return __btrfs_end_transaction(trans, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) +{ + return __btrfs_end_transaction(trans, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are sent to disk but does not wait on them + */ +int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages, int mark) +{ + int err = 0; + int werr = 0; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 end; + + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to __btrfs_wait_marked_extents() would not know that + * writeback for this range started and therefore wouldn't + * wait for it to finish - we don't want to commit a + * superblock that points to btree nodes/leafs for which + * writeback hasn't finished yet (and without errors). 
+ * We cleanup any entries left in the io tree when committing + * the transaction (through extent_io_tree_release()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); + cached_state = NULL; + cond_resched(); + start = end + 1; + } + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages) +{ + int err = 0; + int werr = 0; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 end; + + while (find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through extent_io_tree_release()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, &cached_state); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + free_extent_state(cached_state); + cached_state = NULL; + cond_resched(); + start = end + 1; + } + if (err) + werr = err; + return werr; +} + +static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages) +{ + bool errors = false; + int err; + + err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) + errors = true; + + if (errors && !err) + err = -EIO; + return err; +} + +int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) +{ + struct btrfs_fs_info *fs_info = log_root->fs_info; + struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; + bool errors = false; + int err; + + ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if ((mark & EXTENT_DIRTY) && + test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) + errors = true; + + if ((mark & EXTENT_NEW) && + test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) + errors = true; + + if (errors && !err) + err = -EIO; + return err; +} + +/* + * When btree blocks are allocated the corresponding extents are marked dirty. + * This function ensures such extents are persisted on disk for transaction or + * log commit. 
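Editorial note: btrfs_write_marked_extents() above and __btrfs_wait_marked_extents() below it share an error convention — keep iterating over all dirty ranges, stash a failure in werr, and only return it at the end, so every range still gets written or waited on. Stripped of the extent-tree machinery, the pattern is simply:

#include <stdio.h>

/* Stand-in for writing or waiting on one range; fails for range 2 here. */
static int process_range(int i)
{
	return i == 2 ? -5 /* pretend -EIO */ : 0;
}

int main(void)
{
	int err, werr = 0;

	for (int i = 0; i < 4; i++) {
		err = process_range(i);
		if (err)
			werr = err;	/* remember the failure ... */
		/* ... but keep going so the remaining ranges are still handled */
	}
	printf("returned error: %d\n", werr);	/* -5 */
	return 0;
}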
+ * + * @trans: transaction whose dirty pages we'd like to write + */ +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) +{ + int ret; + int ret2; + struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct blk_plug plug; + + blk_start_plug(&plug); + ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY); + blk_finish_plug(&plug); + ret2 = btrfs_wait_extents(fs_info, dirty_pages); + + extent_io_tree_release(&trans->transaction->dirty_pages); + + if (ret) + return ret; + else if (ret2) + return ret2; + else + return 0; +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + u64 old_root_used; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; + + old_root_used = btrfs_root_used(&root->root_item); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) + break; + + btrfs_set_root_node(&root->root_item, root->node); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + if (ret) + return ret; + + old_root_used = btrfs_root_used(&root->root_item); + } + + return 0; +} + +/* + * update all the cowonly tree roots on disk + * + * The error handling in this function may not be obvious. Any of the + * failures will cause the file system to go offline. We still need + * to clean up the delayed refs. + */ +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; + struct list_head *io_bgs = &trans->transaction->io_bgs; + struct list_head *next; + struct extent_buffer *eb; + int ret; + + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. 
+ */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + + eb = btrfs_lock_root_node(fs_info->tree_root); + ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, + 0, &eb, BTRFS_NESTING_COW); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + if (ret) + return ret; + + ret = btrfs_run_dev_stats(trans); + if (ret) + return ret; + ret = btrfs_run_dev_replace(trans); + if (ret) + return ret; + ret = btrfs_run_qgroups(trans); + if (ret) + return ret; + + ret = btrfs_setup_space_cache(trans); + if (ret) + return ret; + +again: + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + struct btrfs_root *root; + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); + + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + ret = update_cowonly_root(trans, root); + if (ret) + return ret; + } + + /* Now flush any delayed refs generated by updating all of the roots */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { + ret = btrfs_write_dirty_block_groups(trans); + if (ret) + return ret; + + /* + * We're writing the dirty block groups, which could generate + * delayed refs, which could generate more dirty block groups, + * so we want to keep this flushing in this loop to make sure + * everything gets run. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + } + + if (!list_empty(&fs_info->dirty_cowonly_roots)) + goto again; + + /* Update dev-replace pointer once everything is committed */ + fs_info->dev_replace.committed_cursor_left = + fs_info->dev_replace.cursor_left_last_write_of_item; + + return 0; +} + +/* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +void btrfs_add_dead_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); + if (list_empty(&root->root_list)) { + btrfs_grab_root(root); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); + } + spin_unlock(&fs_info->trans_lock); +} + +/* + * Update each subvolume root and its relocation root, if it exists, in the tree + * of tree roots. Also free log roots if they exist. 
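Editorial note: both update_cowonly_root() and the again: loop above are fixed-point loops — committing one piece of state (a root item, a dirty block group) can dirty more state (COW the extent tree, generate delayed refs), so the code repeats until a pass produces no new work. The shape of that loop, reduced to a toy:

#include <stdio.h>

static int pending;	/* units of dirty work still to flush */

/* Flushing one unit may create new work; here every third unit does. */
static void flush_one(int n)
{
	pending--;
	if (n % 3 == 0)
		pending++;	/* e.g. updating a root COWs another block */
}

int main(void)
{
	int n = 0;

	pending = 5;
	while (pending > 0)	/* loop until a pass generates nothing new */
		flush_one(++n);
	printf("converged after %d flushes\n", n);
	return 0;
}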
+ */ +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *gang[8]; + int i; + int ret; + + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + int ret2; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. + */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } + + ret2 = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (ret2) + return ret2; + spin_lock(&fs_info->fs_roots_radix_lock); + btrfs_qgroup_free_meta_all_pertrans(root); + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); + return 0; +} + +/* + * defrag a given btree. + * Every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_trans_handle *trans; + int ret; + + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) + return 0; + + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + ret = btrfs_defrag_leaves(trans, root); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(info); + cond_resched(); + + if (btrfs_fs_closing(info) || ret != -EAGAIN) + break; + + if (btrfs_defrag_cancelled(info)) { + btrfs_debug(info, "defrag_root cancelled"); + ret = -EAGAIN; + break; + } + } + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); + return ret; +} + +/* + * Do all special snapshot related qgroup dirty hack. + * + * Will do all needed qgroup inherit and dirty hack like switch commit + * roots inside one transaction and write all btree into disk, to make + * qgroup works. + */ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_root *src, + struct btrfs_root *parent, + struct btrfs_qgroup_inherit *inherit, + u64 dst_objectid) +{ + struct btrfs_fs_info *fs_info = src->fs_info; + int ret; + + /* + * Save some performance in the case that qgroups are not + * enabled. If this check races with the ioctl, rescan will + * kick in anyway. + */ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + /* + * Ensure dirty @src will be committed. Or, after coming + * commit_fs_roots() and switch_commit_roots(), any dirty but not + * recorded root will never be updated again, causing an outdated root + * item. 
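btrfs_defrag_root() above keeps starting short transactions and only loops while the leaf walker reports -EAGAIN and nothing asked it to stop. The control flow, reduced to a standalone sketch with stubbed helpers (closing(), cancelled() and defrag_one_pass() are hypothetical):

/* Sketch of the defrag control flow: keep going only while the worker
 * says "call me again" (-EAGAIN) and nothing asked us to stop. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int passes;

static int defrag_one_pass(void)
{
	return (++passes < 4) ? -EAGAIN : 0;	/* pretend 4 passes are needed */
}

static bool closing(void) { return false; }
static bool cancelled(void) { return false; }

static int defrag_root_model(void)
{
	int ret;

	while (1) {
		ret = defrag_one_pass();
		if (closing() || ret != -EAGAIN)
			break;
		if (cancelled()) {
			ret = -EAGAIN;
			break;
		}
	}
	return ret;
}

int main(void)
{
	printf("defrag finished with %d after %d passes\n", defrag_root_model(), passes);
	return 0;
}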
+ */ + ret = record_root_in_trans(trans, src, 1); + if (ret) + return ret; + + /* + * btrfs_qgroup_inherit relies on a consistent view of the usage for the + * src root, so we must run the delayed refs here. + * + * However this isn't particularly fool proof, because there's no + * synchronization keeping us from changing the tree after this point + * before we do the qgroup_inherit, or even from making changes while + * we're doing the qgroup_inherit. But that's a problem for the future, + * for now flush the delayed refs to narrow the race window where the + * qgroup counters could end up wrong. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = commit_fs_roots(trans); + if (ret) + goto out; + ret = btrfs_qgroup_account_extents(trans); + if (ret < 0) + goto out; + + /* Now qgroup are all updated, we can inherit it to new qgroups */ + ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, + inherit); + if (ret < 0) + goto out; + + /* + * Now we do a simplified commit transaction, which will: + * 1) commit all subvolume and extent tree + * To ensure all subvolume and extent tree have a valid + * commit_root to accounting later insert_dir_item() + * 2) write all btree blocks onto disk + * This is to make sure later btree modification will be cowed + * Or commit_root can be populated and cause wrong qgroup numbers + * In this simplified commit, we don't really care about other trees + * like chunk and root tree, as they won't affect qgroup. + * And we don't write super to avoid half committed status. + */ + ret = commit_cowonly_roots(trans); + if (ret) + goto out; + switch_commit_roots(trans); + ret = btrfs_write_and_wait_transaction(trans); + if (ret) + btrfs_handle_fs_error(fs_info, ret, + "Error while writing out transaction for qgroup"); + +out: + /* + * Force parent root to be updated, as we recorded it before so its + * last_trans == cur_transid. + * Or it won't be committed again onto disk after later + * insert_dir_item() + */ + if (!ret) + ret = record_root_in_trans(trans, parent, 1); + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0. + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct btrfs_root *parent_root; + struct btrfs_block_rsv *rsv; + struct inode *parent_inode = pending->dir; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *tmp; + struct extent_buffer *old; + struct timespec64 cur_time; + int ret = 0; + u64 to_reserve = 0; + u64 index = 0; + u64 objectid; + u64 root_flags; + unsigned int nofs_flags; + struct fscrypt_name fname; + + ASSERT(pending->path); + path = pending->path; + + ASSERT(pending->root_item); + new_root_item = pending->root_item; + + /* + * We're inside a transaction and must make sure that any potential + * allocations with GFP_KERNEL in fscrypt won't recurse back to + * filesystem. 
+ */ + nofs_flags = memalloc_nofs_save(); + pending->error = fscrypt_setup_filename(parent_inode, + &pending->dentry->d_name, 0, + &fname); + memalloc_nofs_restore(nofs_flags); + if (pending->error) + goto free_pending; + + pending->error = btrfs_get_free_objectid(tree_root, &objectid); + if (pending->error) + goto free_fname; + + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). + */ + btrfs_set_skip_qgroup(trans, objectid); + + btrfs_reloc_pre_snapshot(pending, &to_reserve); + + if (to_reserve > 0) { + pending->error = btrfs_block_rsv_add(fs_info, + &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); + if (pending->error) + goto clear_skip_qgroup; + } + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; + + rsv = trans->block_rsv; + trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, + trans->bytes_reserved, 1); + parent_root = BTRFS_I(parent_inode)->root; + ret = record_root_in_trans(trans, parent_root, 0); + if (ret) + goto fail; + cur_time = current_time(parent_inode); + + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + /* check if there is a file/dir which has the same name. */ + dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, + btrfs_ino(BTRFS_I(parent_inode)), + &fname.disk_name, 0); + if (dir_item != NULL && !IS_ERR(dir_item)) { + pending->error = -EEXIST; + goto dir_item_existed; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + btrfs_abort_transaction(trans, ret); + goto fail; + } + btrfs_release_path(path); + + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans); + if (ret) { /* Transaction aborted */ + btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = record_root_in_trans(trans, root, 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); + + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + + btrfs_set_root_generation_v2(new_root_item, + trans->transid); + generate_random_guid(new_root_item->uuid); + memcpy(new_root_item->parent_uuid, root->root_item.uuid, + BTRFS_UUID_SIZE); + if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { + memset(new_root_item->received_uuid, 0, + sizeof(new_root_item->received_uuid)); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + } + btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); + btrfs_set_root_otransid(new_root_item, trans->transid); + + old = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(old); + free_extent_buffer(old); + 
btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = btrfs_copy_root(trans, root, old, &tmp, objectid); + /* clean up in any case */ + btrfs_tree_unlock(old); + free_extent_buffer(old); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_wmb(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, objectid, + parent_root->root_key.objectid, + btrfs_ino(BTRFS_I(parent_inode)), index, + &fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + key.offset = (u64)-1; + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); + if (IS_ERR(pending->snap)) { + ret = PTR_ERR(pending->snap); + pending->snap = NULL; + btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = btrfs_reloc_post_snapshot(trans, pending); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + /* + * Do special qgroup accounting for snapshot, as we do some qgroup + * snapshot hack to do fast snapshot. + * To co-operate with that hack, we do hack again. + * Or snapshot will be greatly slowed down by a subtree qgroup rescan + */ + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + if (ret < 0) + goto fail; + + ret = btrfs_insert_dir_item(trans, &fname.disk_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); + /* We have check then name at the beginning, so it is impossible. 
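Earlier in this hunk the new root item is inserted with key.objectid set to the freshly allocated subvolume id and key.offset set to the creating transaction id. A small userspace illustration of how such a key is put together; the type value mirrors BTRFS_ROOT_ITEM_KEY as I recall it upstream, everything else is hypothetical:

/* Userspace illustration of the snapshot root-item key built above:
 * objectid = new subvolume id, offset = creating transid. */
#include <stdint.h>
#include <stdio.h>

struct toy_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

#define TOY_ROOT_ITEM_KEY 132	/* believed to match BTRFS_ROOT_ITEM_KEY */

static struct toy_key snapshot_root_key(uint64_t objectid, uint64_t transid)
{
	struct toy_key key = {
		.objectid = objectid,	/* id handed out for the new subvolume */
		.type = TOY_ROOT_ITEM_KEY,
		.offset = transid,	/* records when the snapshot was created */
	};
	return key;
}

int main(void)
{
	struct toy_key k = snapshot_root_key(257, 12345);

	printf("(%llu %u %llu)\n", (unsigned long long)k.objectid,
	       (unsigned)k.type, (unsigned long long)k.offset);
	return 0;
}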
*/ + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + fname.disk_name.len * 2); + parent_inode->i_mtime = inode_set_ctime_current(parent_inode); + ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, + objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { + ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + } + +fail: + pending->error = ret; +dir_item_existed: + trans->block_rsv = rsv; + trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); +free_fname: + fscrypt_free_filename(&fname); +free_pending: + kfree(new_root_item); + pending->root_item = NULL; + btrfs_free_path(path); + pending->path = NULL; + + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) +{ + struct btrfs_pending_snapshot *pending, *next; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret = 0; + + list_for_each_entry_safe(pending, next, head, list) { + list_del(&pending->list); + ret = create_pending_snapshot(trans, pending); + if (ret) + break; + } + return ret; +} + +static void update_super_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = fs_info->super_copy; + + root_item = &fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; + if (btrfs_test_opt(fs_info, SPACE_CACHE)) + super->cache_generation = root_item->generation; + else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) + super->cache_generation = 0; + if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) + super->uuid_tree_generation = root_item->generation; +} + +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + struct btrfs_transaction *trans; + int ret = 0; + + spin_lock(&info->trans_lock); + trans = info->running_transaction; + if (trans) + ret = (trans->state >= TRANS_STATE_COMMIT_START); + spin_unlock(&info->trans_lock); + return ret; +} + +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + struct btrfs_transaction *trans; + int ret = 0; + + spin_lock(&info->trans_lock); + trans = info->running_transaction; + if (trans) + ret = is_transaction_blocked(trans); + spin_unlock(&info->trans_lock); + return ret; +} + +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_transaction *cur_trans; + + /* Kick the transaction kthread. 
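create_pending_snapshots() above walks the pending list with list_for_each_entry_safe() so each entry can be detached while iterating. The same idea in plain C, caching the next pointer before the current node may go away (types and helpers are hypothetical):

/* Plain-C version of the "safe" list walk: remember the next node before
 * the current one may be freed. */
#include <stdio.h>
#include <stdlib.h>

struct pending {
	int id;
	struct pending *next;
};

static int create_one(struct pending *p)
{
	printf("creating snapshot %d\n", p->id);
	return 0;	/* a real failure would stop the walk, as above */
}

static int process_all(struct pending **headp)
{
	struct pending *cur = *headp, *next;
	int ret = 0;

	while (cur) {
		next = cur->next;	/* grab it before cur is freed */
		ret = create_one(cur);
		free(cur);
		cur = next;
		if (ret)
			break;
	}
	*headp = cur;
	return ret;
}

int main(void)
{
	struct pending *head = NULL;

	for (int i = 3; i >= 1; i--) {
		struct pending *p = malloc(sizeof(*p));

		p->id = i;
		p->next = head;
		head = p;
	}
	return process_all(&head);
}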
*/ + set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); + + /* take transaction reference */ + cur_trans = trans->transaction; + refcount_inc(&cur_trans->use_count); + + btrfs_end_transaction(trans); + + /* + * Wait for the current transaction commit to start and block + * subsequent transaction joins + */ + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); + wait_event(fs_info->transaction_blocked_wait, + cur_trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(cur_trans)); + btrfs_put_transaction(cur_trans); +} + +static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; + + WARN_ON(refcount_read(&trans->use_count) > 1); + + btrfs_abort_transaction(trans, err); + + spin_lock(&fs_info->trans_lock); + + /* + * If the transaction is removed from the list, it means this + * transaction has been committed successfully, so it is impossible + * to call the cleanup function. + */ + BUG_ON(list_empty(&cur_trans->list)); + + if (cur_trans == fs_info->running_transaction) { + cur_trans->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&fs_info->trans_lock); + + /* + * The thread has already released the lockdep map as reader + * already in btrfs_commit_transaction(). + */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&fs_info->trans_lock); + } + + /* + * Now that we know no one else is still using the transaction we can + * remove the transaction from the list of transactions. This avoids + * the transaction kthread from cleaning up the transaction while some + * other task is still using it, which could result in a use-after-free + * on things like log trees, as it forces the transaction kthread to + * wait for this transaction to be cleaned up by us. + */ + list_del_init(&cur_trans->list); + + spin_unlock(&fs_info->trans_lock); + + btrfs_cleanup_one_transaction(trans->transaction, fs_info); + + spin_lock(&fs_info->trans_lock); + if (cur_trans == fs_info->running_transaction) + fs_info->running_transaction = NULL; + spin_unlock(&fs_info->trans_lock); + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(fs_info->sb); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + + trace_btrfs_transaction_commit(fs_info); + + if (current->journal_info == trans) + current->journal_info = NULL; + + /* + * If relocation is running, we can't cancel scrub because that will + * result in a deadlock. Before relocating a block group, relocation + * pauses scrub, then starts and commits a transaction before unpausing + * scrub. If the transaction commit is being done by the relocation + * task or triggered by another task and the relocation task is waiting + * for the commit, and we end up here due to an error in the commit + * path, then calling btrfs_scrub_cancel() will deadlock, as we are + * asking for scrub to stop while having it asked to be paused higher + * above in relocation code. 
+ */ + if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + btrfs_scrub_cancel(fs_info); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); +} + +/* + * Release reserved delayed ref space of all pending block groups of the + * transaction and remove them from the list + */ +static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *block_group, *tmp; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } +} + +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + /* + * We use try_to_writeback_inodes_sb() here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. + * + * Note that try_to_writeback_inodes_sb() will only trigger writeback + * if it can read lock sb->s_umount. It will always be able to lock it, + * except when the filesystem is being unmounted or being frozen, but in + * those cases sync_filesystem() is called, which results in calling + * writeback_inodes_sb() while holding a write lock on sb->s_umount. + * Note that we don't call writeback_inodes_sb() directly, because it + * will emit a warning if sb->s_umount is not locked. + */ + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); +} + +/* + * Add a pending snapshot associated with the given transaction handle to the + * respective handle. This must be called after the transaction commit started + * and while holding fs_info->trans_lock. + * This serves to guarantee a caller of btrfs_commit_transaction() that it can + * safely free the pending snapshot pointer in case btrfs_commit_transaction() + * returns an error. 
+ */ +static void add_pending_snapshot(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->pending_snapshot) + return; + + lockdep_assert_held(&trans->fs_info->trans_lock); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); + + list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); +} + +static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +{ + fs_info->commit_stats.commit_count++; + fs_info->commit_stats.last_commit_dur = interval; + fs_info->commit_stats.max_commit_dur = + max_t(u64, fs_info->commit_stats.max_commit_dur, interval); + fs_info->commit_stats.total_commit_dur += interval; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_transaction *prev_trans = NULL; + int ret; + ktime_t start_time; + ktime_t interval; + + ASSERT(refcount_read(&trans->use_count) == 1); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); + + clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + + /* Stop the commit early if ->aborted is set */ + if (TRANS_ABORTED(cur_trans)) { + ret = cur_trans->aborted; + goto lockdep_trans_commit_start_release; + } + + btrfs_trans_release_metadata(trans); + trans->block_rsv = NULL; + + /* + * We only want one transaction commit doing the flushing so we do not + * waste a bunch of time on lock contention on the extent root node. + */ + if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, + &cur_trans->delayed_refs.flags)) { + /* + * Make a pass through all the delayed refs we have so far. + * Any running threads may add more while we are here. + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (ret) + goto lockdep_trans_commit_start_release; + } + + btrfs_create_pending_block_groups(trans); + + if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { + int run_it = 0; + + /* this mutex is also taken before trying to set + * block groups readonly. We need to make sure + * that nobody has set a block group readonly + * after a extents from that block group have been + * allocated for cache files. btrfs_set_block_group_ro + * will wait for the transaction to commit if it + * finds BTRFS_TRANS_DIRTY_BG_RUN set. + * + * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure + * only one process starts all the block group IO. It wouldn't + * hurt to have more than one go through, but there's no + * real advantage to it either. 
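Both BTRFS_DELAYED_REFS_FLUSHING and BTRFS_TRANS_DIRTY_BG_RUN above use test_and_set_bit() as a "first caller wins" gate so that only one task performs the flush or the block group IO. The same gate modelled with a C11 atomic flag, purely for illustration:

/* "First caller does the work" gate, modelled with a C11 atomic flag
 * instead of test_and_set_bit(). */
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag flushing = ATOMIC_FLAG_INIT;

static void maybe_flush(const char *who)
{
	/* Returns the previous value: false means we are the first caller. */
	if (!atomic_flag_test_and_set(&flushing))
		printf("%s: running the flush\n", who);
	else
		printf("%s: someone else is already flushing\n", who);
}

int main(void)
{
	maybe_flush("task A");
	maybe_flush("task B");
	return 0;
}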
+ */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, + &cur_trans->flags)) + run_it = 1; + mutex_unlock(&fs_info->ro_block_group_mutex); + + if (run_it) { + ret = btrfs_start_dirty_block_groups(trans); + if (ret) + goto lockdep_trans_commit_start_release; + } + } + + spin_lock(&fs_info->trans_lock); + if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + + add_pending_snapshot(trans); + + spin_unlock(&fs_info->trans_lock); + refcount_inc(&cur_trans->use_count); + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + + btrfs_trans_state_lockdep_release(fs_info, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); + ret = btrfs_end_transaction(trans); + wait_for_commit(cur_trans, want_state); + + if (TRANS_ABORTED(cur_trans)) + ret = cur_trans->aborted; + + btrfs_put_transaction(cur_trans); + + return ret; + } + + cur_trans->state = TRANS_STATE_COMMIT_PREP; + wake_up(&fs_info->transaction_blocked_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); + + if (cur_trans->list.prev != &fs_info->trans_list) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (prev_trans->state < want_state) { + refcount_inc(&prev_trans->use_count); + spin_unlock(&fs_info->trans_lock); + + wait_for_commit(prev_trans, want_state); + + ret = READ_ONCE(prev_trans->aborted); + + btrfs_put_transaction(prev_trans); + if (ret) + goto lockdep_release; + spin_lock(&fs_info->trans_lock); + } + } else { + /* + * The previous transaction was aborted and was already removed + * from the list of transactions at fs_info->trans_list. So we + * abort to prevent writing a new superblock that reflects a + * corrupt state (pointing to trees with unwritten nodes/leafs). + */ + if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); + ret = -EROFS; + goto lockdep_release; + } + } + + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + spin_unlock(&fs_info->trans_lock); + + /* + * Get the time spent on the work done by the commit thread and not + * the time spent waiting on a previous commit + */ + start_time = ktime_get_ns(); + + extwriter_counter_dec(cur_trans, trans->type); + + ret = btrfs_start_delalloc_flush(fs_info); + if (ret) + goto lockdep_release; + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto lockdep_release; + + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); + wait_event(cur_trans->writer_wait, + extwriter_counter_read(cur_trans) == 0); + + /* some pending stuffs might be added after the previous flush. */ + ret = btrfs_run_delayed_items(trans); + if (ret) { + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + goto cleanup_transaction; + } + + btrfs_wait_delalloc_flush(fs_info); + + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. 
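The commit states form a strictly increasing progression, which is why waiting on a transaction reduces to "state >= want_state" and an fsync can stop early at SUPER_COMMITTED. A tiny standalone model of that ordering (enum values mirror only the order, not any on-disk or ABI values):

/* The commit states are ordered, so "wait for X" is just "state >= X". */
#include <stdbool.h>
#include <stdio.h>

enum toy_trans_state {
	T_RUNNING,
	T_COMMIT_PREP,
	T_COMMIT_START,
	T_COMMIT_DOING,
	T_UNBLOCKED,
	T_SUPER_COMMITTED,
	T_COMPLETED,
};

static bool reached(enum toy_trans_state state, enum toy_trans_state want)
{
	return state >= want;
}

int main(void)
{
	enum toy_trans_state want = T_COMPLETED;
	bool in_fsync = true;

	if (in_fsync)		/* fsync only needs the super on disk */
		want = T_SUPER_COMMITTED;

	printf("done waiting: %d\n", reached(T_SUPER_COMMITTED, want));
	return 0;
}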
+ */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); + + btrfs_scrub_pause(fs_info); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * COMMIT_DOING so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&fs_info->trans_lock); + add_pending_snapshot(trans); + cur_trans->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&fs_info->trans_lock); + + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* + * Make lockdep happy by acquiring the state locks after + * btrfs_trans_num_writers is released. If we acquired the state locks + * before releasing the btrfs_trans_num_writers lock then lockdep would + * complain because we did not follow the reverse order unlocking rule. + */ + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + + /* + * We've started the commit, clear the flag in case we were triggered to + * do an async commit but somebody else started before the transaction + * kthread could do the work. + */ + clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + + if (TRANS_ABORTED(cur_trans)) { + ret = cur_trans->aborted; + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + goto scrub_continue; + } + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&fs_info->reloc_mutex); + + /* + * We needn't worry about the delayed items because we will + * deal with them in create_pending_snapshot(), which is the + * core function of the snapshot creation. + */ + ret = create_pending_snapshots(trans); + if (ret) + goto unlock_reloc; + + /* + * We insert the dir indexes of the snapshots and update the inode + * of the snapshots' parents after the snapshot creation, so there + * are some delayed items which are not dealt with. Now deal with + * them. + * + * We needn't worry that this operation will corrupt the snapshots, + * because all the tree which are snapshoted will be forced to COW + * the nodes and leaves. + */ + ret = btrfs_run_delayed_items(trans); + if (ret) + goto unlock_reloc; + + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + goto unlock_reloc; + + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(fs_info); + + WARN_ON(cur_trans != trans->transaction); + + ret = commit_fs_roots(trans); + if (ret) + goto unlock_reloc; + + /* commit_fs_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, fs_info); + + /* + * Since fs roots are all committed, we can get a quite accurate + * new_roots. So let's do quota accounting. 
+ */ + ret = btrfs_qgroup_account_extents(trans); + if (ret < 0) + goto unlock_reloc; + + ret = commit_cowonly_roots(trans); + if (ret) + goto unlock_reloc; + + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (TRANS_ABORTED(cur_trans)) { + ret = cur_trans->aborted; + goto unlock_reloc; + } + + cur_trans = fs_info->running_transaction; + + btrfs_set_root_node(&fs_info->tree_root->root_item, + fs_info->tree_root->node); + list_add_tail(&fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); + + btrfs_set_root_node(&fs_info->chunk_root->root_item, + fs_info->chunk_root->node); + list_add_tail(&fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_set_root_node(&fs_info->block_group_root->root_item, + fs_info->block_group_root->node); + list_add_tail(&fs_info->block_group_root->dirty_list, + &cur_trans->switch_commits); + } + + switch_commit_roots(trans); + + ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); + update_super_roots(fs_info); + + btrfs_set_super_log_root(fs_info->super_copy, 0); + btrfs_set_super_log_root_level(fs_info->super_copy, 0); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_copy)); + + btrfs_commit_device_sizes(cur_trans); + + clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); + clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); + + btrfs_trans_release_chunk_metadata(trans); + + /* + * Before changing the transaction state to TRANS_STATE_UNBLOCKED and + * setting fs_info->running_transaction to NULL, lock tree_log_mutex to + * make sure that before we commit our superblock, no other task can + * start a new transaction and commit a log tree before we commit our + * superblock. Anyone trying to commit a log tree locks this mutex before + * writing its superblock. + */ + mutex_lock(&fs_info->tree_log_mutex); + + spin_lock(&fs_info->trans_lock); + cur_trans->state = TRANS_STATE_UNBLOCKED; + fs_info->running_transaction = NULL; + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); + + wake_up(&fs_info->transaction_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + + /* If we have features changed, wake up the cleaner to update sysfs. */ + if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && + fs_info->cleaner_kthread) + wake_up_process(fs_info->cleaner_kthread); + + ret = btrfs_write_and_wait_transaction(trans); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Error while writing out transaction"); + mutex_unlock(&fs_info->tree_log_mutex); + goto scrub_continue; + } + + ret = write_all_supers(fs_info, 0); + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&fs_info->tree_log_mutex); + if (ret) + goto scrub_continue; + + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_SUPER_COMMITTED; + wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + + btrfs_finish_extent_commit(trans); + + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) + btrfs_clear_space_info_full(fs_info); + + fs_info->last_trans_committed = cur_trans->transid; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. 
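update_super_roots(), called above, simply mirrors bytenr/generation/level from the committed tree and chunk root items into the in-memory superblock before it is written. A compact model of that copy with hypothetical types:

/* Minimal model of update_super_roots(): mirror the committed root items
 * into the super block image that will be written out. */
#include <stdint.h>
#include <stdio.h>

struct toy_root_item { uint64_t bytenr, generation; uint8_t level; };
struct toy_super {
	uint64_t root, generation;
	uint8_t root_level;
	uint64_t chunk_root, chunk_root_generation;
	uint8_t chunk_root_level;
};

static void update_super_roots_model(struct toy_super *super,
				     const struct toy_root_item *tree_root,
				     const struct toy_root_item *chunk_root)
{
	super->chunk_root = chunk_root->bytenr;
	super->chunk_root_generation = chunk_root->generation;
	super->chunk_root_level = chunk_root->level;

	super->root = tree_root->bytenr;
	super->generation = tree_root->generation;
	super->root_level = tree_root->level;
}

int main(void)
{
	struct toy_root_item tree = { 4096, 100, 1 }, chunk = { 8192, 100, 0 };
	struct toy_super sb = { 0 };

	update_super_roots_model(&sb, &tree, &chunk);
	printf("super: root=%llu gen=%llu\n",
	       (unsigned long long)sb.root, (unsigned long long)sb.generation);
	return 0;
}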
+ */ + cur_trans->state = TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + + spin_lock(&fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(fs_info->sb); + + trace_btrfs_transaction_commit(fs_info); + + interval = ktime_get_ns() - start_time; + + btrfs_scrub_continue(fs_info); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + update_commit_stats(fs_info, interval); + + return ret; + +unlock_reloc: + mutex_unlock(&fs_info->reloc_mutex); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); +scrub_continue: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + btrfs_scrub_continue(fs_info); +cleanup_transaction: + btrfs_trans_release_metadata(trans); + btrfs_cleanup_pending_block_groups(trans); + btrfs_trans_release_chunk_metadata(trans); + trans->block_rsv = NULL; + btrfs_warn(fs_info, "Skipping commit of aborted transaction."); + if (current->journal_info == trans) + current->journal_info = NULL; + cleanup_transaction(trans, ret); + + return ret; + +lockdep_release: + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + goto cleanup_transaction; + +lockdep_trans_commit_start_release: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); + btrfs_end_transaction(trans); + return ret; +} + +/* + * return < 0 if error + * 0 if there are no more dead_roots at the time of call + * 1 there are more to be processed, call me again + * + * The return value indicates there are certainly more snapshots to delete, but + * if there comes a new one during processing, it may return 0. We don't mind, + * because btrfs_commit_super will poke cleaner thread and it will process it a + * few seconds later. + */ +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + int ret; + + spin_lock(&fs_info->trans_lock); + if (list_empty(&fs_info->dead_roots)) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); + + btrfs_kill_all_delayed_nodes(root); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + ret = btrfs_drop_snapshot(root, 0, 0); + else + ret = btrfs_drop_snapshot(root, 1, 0); + + btrfs_put_root(root); + return (ret < 0) ? 0 : 1; +} + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. 
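The comment above documents the <0 / 0 / 1 return convention of btrfs_clean_one_deleted_snapshot(). A sketch of how a cleaner-style loop consumes that convention (all names hypothetical):

/* Sketch of a cleaner loop driven by the <0 / 0 / 1 convention: keep
 * calling while there is more work, stop on 0 or on an error. */
#include <stdio.h>

static int left = 3;

static int clean_one_deleted_snapshot_model(void)
{
	if (left == 0)
		return 0;	/* nothing left right now */
	left--;
	return 1;		/* call me again */
}

int main(void)
{
	int ret;

	do {
		ret = clean_one_deleted_snapshot_model();
	} while (ret > 0);

	printf("cleaner done, ret=%d\n", ret);
	return 0;
}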
+ */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + +int __init btrfs_transaction_init(void) +{ + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_transaction_exit(void) +{ + kmem_cache_destroy(btrfs_trans_handle_cachep); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 0000000000..238a0ab85d --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#ifndef BTRFS_TRANSACTION_H +#define BTRFS_TRANSACTION_H + +#include +#include "btrfs_inode.h" +#include "delayed-ref.h" +#include "ctree.h" +#include "misc.h" + +/* Radix-tree tag for roots that are part of the transaction. */ +#define BTRFS_ROOT_TRANS_TAG 0 + +enum btrfs_trans_state { + TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_PREP, + TRANS_STATE_COMMIT_START, + TRANS_STATE_COMMIT_DOING, + TRANS_STATE_UNBLOCKED, + TRANS_STATE_SUPER_COMMITTED, + TRANS_STATE_COMPLETED, + TRANS_STATE_MAX, +}; + +#define BTRFS_TRANS_HAVE_FREE_BGS 0 +#define BTRFS_TRANS_DIRTY_BG_RUN 1 +#define BTRFS_TRANS_CACHE_ENOSPC 2 + +struct btrfs_transaction { + u64 transid; + /* + * total external writers (USERSPACE/START/ATTACH) in this + * transaction, it must be zero before the transaction can be + * committed + */ + atomic_t num_extwriters; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ + atomic_t num_writers; + refcount_t use_count; + + unsigned long flags; + + /* Protected by fs_info->trans_lock when we want to change it. */ + enum btrfs_trans_state state; + int aborted; + struct list_head list; + struct extent_io_tree dirty_pages; + time64_t start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; + struct list_head dev_update_list; + struct list_head switch_commits; + struct list_head dirty_bgs; + + /* + * There is no explicit lock which protects io_bgs, rather its + * consistency is implied by the fact that all the sites which modify + * it do so under some form of transaction critical section, namely: + * + * - btrfs_start_dirty_block_groups - This function can only ever be + * run by one of the transaction committers. Refer to + * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction + * + * - btrfs_write_dirty_block_groups - this is called by + * commit_cowonly_roots from transaction critical section + * (TRANS_STATE_COMMIT_DOING) + * + * - btrfs_cleanup_dirty_bgs - called on transaction abort + */ + struct list_head io_bgs; + struct list_head dropped_roots; + struct extent_io_tree pinned_extents; + + /* + * we need to make sure block group deletion doesn't race with + * free space cache writeout.
This mutex keeps them from stomping + * on each other + */ + struct mutex cache_write_mutex; + spinlock_t dirty_bgs_lock; + /* Protected by spin lock fs_info->unused_bgs_lock. */ + struct list_head deleted_bgs; + spinlock_t dropped_roots_lock; + struct btrfs_delayed_ref_root delayed_refs; + struct btrfs_fs_info *fs_info; + + /* + * Number of ordered extents the transaction must wait for before + * committing. These are ordered extents started by a fast fsync. + */ + atomic_t pending_ordered; + wait_queue_head_t pending_wait; +}; + +enum { + ENUM_BIT(__TRANS_FREEZABLE), + ENUM_BIT(__TRANS_START), + ENUM_BIT(__TRANS_ATTACH), + ENUM_BIT(__TRANS_JOIN), + ENUM_BIT(__TRANS_JOIN_NOLOCK), + ENUM_BIT(__TRANS_DUMMY), + ENUM_BIT(__TRANS_JOIN_NOSTART), +}; + +#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) +#define TRANS_ATTACH (__TRANS_ATTACH) +#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) +#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) +#define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART) + +#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) + +struct btrfs_trans_handle { + u64 transid; + u64 bytes_reserved; + u64 chunk_bytes_reserved; + unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; + /* Set by a task that wants to create a snapshot. */ + struct btrfs_pending_snapshot *pending_snapshot; + refcount_t use_count; + unsigned int type; + /* + * Error code of transaction abort, set outside of locks and must use + * the READ_ONCE/WRITE_ONCE access + */ + short aborted; + bool adding_csums; + bool allocating_chunk; + bool removing_chunk; + bool reloc_reserved; + bool in_fsync; + struct btrfs_fs_info *fs_info; + struct list_head new_bgs; +}; + +/* + * The abort status can be changed between calls and is not protected by locks. + * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's + * set to a non-zero value it does not change, so the macro should be in checks + * but is not necessary for further reads of the value. + */ +#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted))) + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct inode *dir; + struct btrfs_root *root; + struct btrfs_root_item *root_item; + struct btrfs_root *snap; + struct btrfs_qgroup_inherit *inherit; + struct btrfs_path *path; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + /* extra metadata reservation for relocation */ + int error; + /* Preallocated anonymous block device number */ + dev_t anon_dev; + bool readonly; + struct list_head list; +}; + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_trans = trans->transaction->transid; + inode->last_sub_trans = inode->root->log_transid; + inode->last_log_commit = inode->last_sub_trans - 1; + spin_unlock(&inode->lock); +} + +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. 
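Earlier in this header the transaction handle types are built from single ENUM_BIT() flags, combined into composites such as TRANS_START and tested with masks such as TRANS_EXTWRITERS. A standalone illustration of the same bit arithmetic, with made-up macro names:

/* Standalone illustration of the transaction-type bit masks: each base
 * type is one bit, composites OR them, membership is a mask test. */
#include <stdio.h>

#define T_FREEZABLE	(1U << 0)
#define T_START		(1U << 1)
#define T_ATTACH	(1U << 2)
#define T_JOIN		(1U << 3)

#define TYPE_START	(T_START | T_FREEZABLE)
#define TYPE_ATTACH	(T_ATTACH)
#define TYPE_JOIN	(T_JOIN | T_FREEZABLE)

/* External writers are only START and ATTACH handles. */
#define TYPE_EXTWRITERS	(T_START | T_ATTACH)

int main(void)
{
	unsigned int type = TYPE_START;

	printf("is extwriter: %d\n", !!(type & TYPE_EXTWRITERS));
	printf("is freezable: %d\n", !!(type & T_FREEZABLE));
	return 0;
}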
+ */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; +} + +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + +int btrfs_end_transaction(struct btrfs_trans_handle *trans); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + unsigned int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( + struct btrfs_root *root, + unsigned int num_items); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); +int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); + +void btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans); +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); +bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); +void btrfs_throttle(struct btrfs_fs_info *fs_info); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void btrfs_put_transaction(struct btrfs_transaction *transaction); +void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); + +int __init btrfs_transaction_init(void); +void 
__cold btrfs_transaction_exit(void); + +#endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c new file mode 100644 index 0000000000..cc6bc59851 --- /dev/null +++ b/fs/btrfs/tree-checker.c @@ -0,0 +1,2037 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + */ + +/* + * The module is used to catch unexpected/corrupted tree block data. + * Such behavior can be caused either by a fuzzed image or bugs. + * + * The objective is to do leaf/node validation checks when tree block is read + * from disk, and check *every* possible member, so other code won't + * need to check them again. + * + * Due to the potential and unwanted damage, every checker needs to be + * carefully reviewed, otherwise it could prevent valid images from being + * mounted. + */ + +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "tree-checker.h" +#include "disk-io.h" +#include "compression.h" +#include "volumes.h" +#include "misc.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "inode-item.h" + +/* + * Error messages should use the following format: + * corrupt <type>: <identifier>, <reason>[, <bad_value>] + * + * @type: leaf or node + * @identifier: the necessary info to locate the leaf/node. + * It's recommended to decode key.objectid/offset if it's + * meaningful. + * @reason: describe the error + * @bad_value: optional, it's recommended to output bad value and its + * expected value (range). + * + * Since comma is used to separate the components, only space is allowed + * inside each component. + */ + +/* + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. + * Allows callers to customize the output. + */ +__printf(3, 4) +__cold +static void generic_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf); + va_end(args); +} + +/* + * Customized reporter for extent data item, since its key objectid and + * offset have their own meaning. + */ +__printf(3, 4) +__cold +static void file_extent_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", + btrfs_header_level(eb) == 0 ?
"leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, key.offset, &vaf); + va_end(args); +} + +/* + * Return 0 if the btrfs_file_extent_##name is aligned to @alignment + * Else return 1 + */ +#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \ +({ \ + if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)))) \ + file_extent_err((leaf), (slot), \ + "invalid %s for file extent, have %llu, should be aligned to %u", \ + (#name), btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)); \ + (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ +}) + +static u64 file_extent_end(struct extent_buffer *leaf, + struct btrfs_key *key, + struct btrfs_file_extent_item *extent) +{ + u64 end; + u64 len; + + if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_ram_bytes(leaf, extent); + end = ALIGN(key->offset + len, leaf->fs_info->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(leaf, extent); + end = key->offset + len; + } + return end; +} + +/* + * Customized report for dir_item, the only new important information is + * key->objectid, which represents inode number + */ +__printf(3, 4) +__cold +static void dir_item_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +/* + * This functions checks prev_key->objectid, to ensure current key and prev_key + * share the same objectid as inode number. + * + * This is to detect missing INODE_ITEM in subvolume trees. + * + * Return true if everything is OK or we don't need to check. + * Return false if anything is wrong. + */ +static bool check_prev_ino(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) +{ + /* No prev key, skip check */ + if (slot == 0) + return true; + + /* Only these key->types needs to be checked */ + ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || + key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_DIR_INDEX_KEY || + key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_EXTENT_DATA_KEY); + + /* + * Only subvolume trees along with their reloc trees need this check. + * Things like log tree doesn't follow this ino requirement. 
+ */ + if (!is_fstree(btrfs_header_owner(leaf))) + return true; + + if (key->objectid == prev_key->objectid) + return true; + + /* Error found */ + dir_item_err(leaf, slot, + "invalid previous key objectid, have %llu expect %llu", + prev_key->objectid, key->objectid); + return false; +} +static int check_extent_data_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_file_extent_item *fi; + u32 sectorsize = fs_info->sectorsize; + u32 item_size = btrfs_item_size(leaf, slot); + u64 extent_end; + + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + file_extent_err(leaf, slot, +"unaligned file_offset for file extent, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + + /* + * Previous key must have the same key->objectid (ino). + * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA. + * But if objectids mismatch, it means we have a missing + * INODE_ITEM. + */ + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + /* + * Make sure the item contains at least inline header, so the file + * extent type is not some garbage. + */ + if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) { + file_extent_err(leaf, slot, + "invalid item size, have %u expect [%zu, %u)", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, + SZ_4K); + return -EUCLEAN; + } + if (unlikely(btrfs_file_extent_type(leaf, fi) >= + BTRFS_NR_FILE_EXTENT_TYPES)) { + file_extent_err(leaf, slot, + "invalid type for file extent, have %u expect range [0, %u]", + btrfs_file_extent_type(leaf, fi), + BTRFS_NR_FILE_EXTENT_TYPES - 1); + return -EUCLEAN; + } + + /* + * Support for new compression/encryption must introduce incompat flag, + * and must be caught in open_ctree(). 
+ */ + if (unlikely(btrfs_file_extent_compression(leaf, fi) >= + BTRFS_NR_COMPRESS_TYPES)) { + file_extent_err(leaf, slot, + "invalid compression for file extent, have %u expect range [0, %u]", + btrfs_file_extent_compression(leaf, fi), + BTRFS_NR_COMPRESS_TYPES - 1); + return -EUCLEAN; + } + if (unlikely(btrfs_file_extent_encryption(leaf, fi))) { + file_extent_err(leaf, slot, + "invalid encryption for file extent, have %u expect 0", + btrfs_file_extent_encryption(leaf, fi)); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (unlikely(key->offset)) { + file_extent_err(leaf, slot, + "invalid file_offset for inline file extent, have %llu expect 0", + key->offset); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi))) { + file_extent_err(leaf, slot, + "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (unlikely(item_size != sizeof(*fi))) { + file_extent_err(leaf, slot, + "invalid item size for reg/prealloc file extent, have %u expect %zu", + item_size, sizeof(*fi)); + return -EUCLEAN; + } + if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))) + return -EUCLEAN; + + /* Catch extent end overflow */ + if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end))) { + file_extent_err(leaf, slot, + "extent end overflow, have file offset %llu extent num bytes %llu", + key->offset, + btrfs_file_extent_num_bytes(leaf, fi)); + return -EUCLEAN; + } + + /* + * Check that no two consecutive file extent items, in the same leaf, + * present ranges that overlap each other. 
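The extent-end guard above relies on check_add_overflow() to reject items whose file offset plus length would wrap a u64. The same check written directly with the GCC/Clang builtin that the kernel helper is built on:

/* The extent-end guard, written with the compiler builtin that the
 * kernel's check_add_overflow() wraps. */
#include <stdint.h>
#include <stdio.h>

static int extent_end_checked(uint64_t file_offset, uint64_t num_bytes,
			      uint64_t *end)
{
	if (__builtin_add_overflow(num_bytes, file_offset, end))
		return -1;	/* would wrap past u64, reject the item */
	return 0;
}

int main(void)
{
	uint64_t end;

	if (extent_end_checked(UINT64_MAX - 10, 100, &end))
		printf("overflow detected\n");
	if (!extent_end_checked(4096, 8192, &end))
		printf("end=%llu\n", (unsigned long long)end);
	return 0;
}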
+ */ + if (slot > 0 && + prev_key->objectid == key->objectid && + prev_key->type == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *prev_fi; + u64 prev_end; + + prev_fi = btrfs_item_ptr(leaf, slot - 1, + struct btrfs_file_extent_item); + prev_end = file_extent_end(leaf, prev_key, prev_fi); + if (unlikely(prev_end > key->offset)) { + file_extent_err(leaf, slot - 1, +"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent", + prev_end, key->offset); + return -EUCLEAN; + } + } + + return 0; +} + +static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, + int slot, struct btrfs_key *prev_key) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + u32 sectorsize = fs_info->sectorsize; + const u32 csumsize = fs_info->csum_size; + + if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) { + generic_err(leaf, slot, + "invalid key objectid for csum item, have %llu expect %llu", + key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + generic_err(leaf, slot, + "unaligned key offset for csum item, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { + generic_err(leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", + btrfs_item_size(leaf, slot), csumsize); + return -EUCLEAN; + } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + + prev_item_size = btrfs_item_size(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (unlikely(prev_csum_end > key->offset)) { + generic_err(leaf, slot - 1, +"csum end range (%llu) goes beyond the start range (%llu) of the next csum item", + prev_csum_end, key->offset); + return -EUCLEAN; + } + } + return 0; +} + +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(eb, slot, fmt, ...) 
\ + dir_item_err(eb, slot, fmt, __VA_ARGS__) + +static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_inode_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY); + + /* For XATTR_ITEM, location key should be all 0 */ + if (item_key.type == BTRFS_XATTR_ITEM_KEY) { + if (unlikely(key->objectid != 0 || key->type != 0 || + key->offset != 0)) + return -EUCLEAN; + return 0; + } + + if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID)) { + if (is_inode_item) { + generic_err(leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } else { + dir_item_err(leaf, slot, +"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } + return -EUCLEAN; + } + if (unlikely(key->offset != 0)) { + if (is_inode_item) + inode_item_err(leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + else + dir_item_err(leaf, slot, + "invalid location key offset:has %llu expect 0", + key->offset); + return -EUCLEAN; + } + return 0; +} + +static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_root_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); + + /* + * Bad rootid for reloc trees. + * + * Reloc trees are only for subvolume trees, other trees only need + * to be COWed to be relocated. + */ + if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && + !is_fstree(key->offset))) { + generic_err(leaf, slot, + "invalid reloc tree for root %lld, root id is not a subvolume tree", + key->offset); + return -EUCLEAN; + } + + /* No such tree id */ + if (unlikely(key->objectid == 0)) { + if (is_root_item) + generic_err(leaf, slot, "invalid root id 0"); + else + dir_item_err(leaf, slot, + "invalid location key root id 0"); + return -EUCLEAN; + } + + /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ + if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { + dir_item_err(leaf, slot, + "invalid location key objectid, have %llu expect [%llu, %llu]", + key->objectid, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + + /* + * ROOT_ITEM with non-zero offset means this is a snapshot, created at + * @offset transid. + * Furthermore, for location key in DIR_ITEM, its offset is always -1. + * + * So here we only check offset for reloc tree whose key->offset must + * be a valid tree. 
+ */ + if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID && + key->offset == 0)) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } + return 0; +} + +static int check_dir_item(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_dir_item *di; + u32 item_size = btrfs_item_size(leaf, slot); + u32 cur = 0; + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + while (cur < item_size) { + struct btrfs_key location_key; + u32 name_len; + u32 data_len; + u32 max_name_len; + u32 total_size; + u32 name_hash; + u8 dir_type; + int ret; + + /* header itself should not cross item boundary */ + if (unlikely(cur + sizeof(*di) > item_size)) { + dir_item_err(leaf, slot, + "dir item header crosses item boundary, have %zu boundary %u", + cur + sizeof(*di), item_size); + return -EUCLEAN; + } + + /* Location key check */ + btrfs_dir_item_key_to_cpu(leaf, di, &location_key); + if (location_key.type == BTRFS_ROOT_ITEM_KEY) { + ret = check_root_key(leaf, &location_key, slot); + if (unlikely(ret < 0)) + return ret; + } else if (location_key.type == BTRFS_INODE_ITEM_KEY || + location_key.type == 0) { + ret = check_inode_key(leaf, &location_key, slot); + if (unlikely(ret < 0)) + return ret; + } else { + dir_item_err(leaf, slot, + "invalid location key type, have %u, expect %u or %u", + location_key.type, BTRFS_ROOT_ITEM_KEY, + BTRFS_INODE_ITEM_KEY); + return -EUCLEAN; + } + + /* dir type check */ + dir_type = btrfs_dir_ftype(leaf, di); + if (unlikely(dir_type >= BTRFS_FT_MAX)) { + dir_item_err(leaf, slot, + "invalid dir item type, have %u expect [0, %u)", + dir_type, BTRFS_FT_MAX); + return -EUCLEAN; + } + + if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR)) { + dir_item_err(leaf, slot, + "invalid dir item type for XATTR key, have %u expect %u", + dir_type, BTRFS_FT_XATTR); + return -EUCLEAN; + } + if (unlikely(dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY)) { + dir_item_err(leaf, slot, + "xattr dir type found for non-XATTR key"); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR) + max_name_len = XATTR_NAME_MAX; + else + max_name_len = BTRFS_NAME_LEN; + + /* Name/data length check */ + name_len = btrfs_dir_name_len(leaf, di); + data_len = btrfs_dir_data_len(leaf, di); + if (unlikely(name_len > max_name_len)) { + dir_item_err(leaf, slot, + "dir item name len too long, have %u max %u", + name_len, max_name_len); + return -EUCLEAN; + } + if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) { + dir_item_err(leaf, slot, + "dir item name and data len too long, have %u max %u", + name_len + data_len, + BTRFS_MAX_XATTR_SIZE(fs_info)); + return -EUCLEAN; + } + + if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) { + dir_item_err(leaf, slot, + "dir item with invalid data len, have %u expect 0", + data_len); + return -EUCLEAN; + } + + total_size = sizeof(*di) + name_len + data_len; + + /* header and name/data should not cross item boundary */ + if (unlikely(cur + total_size > item_size)) { + dir_item_err(leaf, slot, + "dir item data crosses item boundary, have %u boundary %u", + cur + total_size, item_size); + return -EUCLEAN; + } + + /* + * Special check for XATTR/DIR_ITEM, as key->offset is name + * hash, should match its name + */ + if (key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_XATTR_ITEM_KEY) { + char 
namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + + read_extent_buffer(leaf, namebuf, + (unsigned long)(di + 1), name_len); + name_hash = btrfs_name_hash(namebuf, name_len); + if (unlikely(key->offset != name_hash)) { + dir_item_err(leaf, slot, + "name hash mismatch with key, have 0x%016x expect 0x%016llx", + name_hash, key->offset); + return -EUCLEAN; + } + } + cur += total_size; + di = (struct btrfs_dir_item *)((void *)di + total_size); + } + return 0; +} + +__printf(3, 4) +__cold +static void block_group_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, key.offset, &vaf); + va_end(args); +} + +static int check_block_group_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_block_group_item bgi; + u32 item_size = btrfs_item_size(leaf, slot); + u64 chunk_objectid; + u64 flags; + u64 type; + + /* + * Here we don't really care about alignment since extent allocator can + * handle it. We care more about the size. + */ + if (unlikely(key->offset == 0)) { + block_group_err(leaf, slot, + "invalid block group size 0"); + return -EUCLEAN; + } + + if (unlikely(item_size != sizeof(bgi))) { + block_group_err(leaf, slot, + "invalid item size, have %u expect %zu", + item_size, sizeof(bgi)); + return -EUCLEAN; + } + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + /* + * We don't init the nr_global_roots until we load the global + * roots, so this could be 0 at mount time. If it's 0 we'll + * just assume we're fine, and later we'll check against our + * actual value. 
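The block group item checks that begin just above come down to a few range and bit-population rules: a non-zero size, used space no larger than the size, at most one RAID profile bit, and a recognized type combination. A rough standalone sketch of those rules (the TOY_BG_* values are made up for illustration and are not the on-disk flag values):

/* Illustrative sketch only: the flag values are assumed, not the on-disk ones. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_BG_DATA      (1ULL << 0)
#define TOY_BG_SYSTEM    (1ULL << 1)
#define TOY_BG_METADATA  (1ULL << 2)
#define TOY_BG_TYPE_MASK (TOY_BG_DATA | TOY_BG_SYSTEM | TOY_BG_METADATA)
/* Everything else is treated as a RAID profile bit in this toy model. */
#define TOY_BG_PROFILE_MASK (~TOY_BG_TYPE_MASK)

static bool toy_block_group_valid(uint64_t length, uint64_t used, uint64_t flags)
{
        uint64_t type = flags & TOY_BG_TYPE_MASK;

        if (length == 0)
                return false;   /* zero-sized block group */
        if (used > length)
                return false;   /* cannot use more space than the group has */
        /* At most one RAID profile bit may be set. */
        if (__builtin_popcountll(flags & TOY_BG_PROFILE_MASK) > 1)
                return false;
        /* Type must be data, metadata, system, or mixed data+metadata. */
        return type == TOY_BG_DATA || type == TOY_BG_METADATA ||
               type == TOY_BG_SYSTEM || type == (TOY_BG_DATA | TOY_BG_METADATA);
}

int main(void)
{
        /* valid: 1 GiB data block group with 1 MiB used */
        printf("%d\n", toy_block_group_valid(1 << 30, 1 << 20, TOY_BG_DATA));
        /* invalid: data+system is not an allowed type combination */
        printf("%d\n", toy_block_group_valid(1 << 30, 1 << 20,
                                             TOY_BG_DATA | TOY_BG_SYSTEM));
        return 0;
}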
+ */ + if (unlikely(fs_info->nr_global_roots && + chunk_objectid >= fs_info->nr_global_roots)) { + block_group_err(leaf, slot, + "invalid block group global root id, have %llu, needs to be <= %llu", + chunk_objectid, + fs_info->nr_global_roots); + return -EUCLEAN; + } + } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + block_group_err(leaf, slot, + "invalid block group chunk objectid, have %llu expect %llu", + btrfs_stack_block_group_chunk_objectid(&bgi), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + + if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) { + block_group_err(leaf, slot, + "invalid block group used, have %llu expect [0, %llu)", + btrfs_stack_block_group_used(&bgi), key->offset); + return -EUCLEAN; + } + + flags = btrfs_stack_block_group_flags(&bgi); + if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) { + block_group_err(leaf, slot, +"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", + flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, + hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); + return -EUCLEAN; + } + + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && + type != BTRFS_BLOCK_GROUP_METADATA && + type != BTRFS_BLOCK_GROUP_SYSTEM && + type != (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA))) { + block_group_err(leaf, slot, +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", + type, hweight64(type), + BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); + return -EUCLEAN; + } + return 0; +} + +__printf(4, 5) +__cold +static void chunk_err(const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = leaf->fs_info; + bool is_sb; + struct va_format vaf; + va_list args; + int i; + int slot = -1; + + /* Only superblock eb is able to have such small offset */ + is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); + + if (!is_sb) { + /* + * Get the slot number by iterating through all slots, this + * would provide better readability. + */ + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + if (btrfs_item_ptr_offset(leaf, i) == + (unsigned long)chunk) { + slot = i; + break; + } + } + } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (is_sb) + btrfs_crit(fs_info, + "corrupt superblock syschunk array: chunk_start=%llu, %pV", + logical, &vaf); + else + btrfs_crit(fs_info, + "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", + BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, + logical, &vaf); + va_end(args); +} + +/* + * The common chunk check which could also work on super block sys chunk array. + * + * Return -EUCLEAN if anything is corrupted. + * Return 0 if everything is OK. 
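The stripe-count checks at the top of btrfs_check_chunk_valid(), just below, encode one rule per RAID profile: a chunk needs at least one stripe, at least as many stripes as there are copies of the data, and more stripes than parity stripes. A self-contained sketch of that rule (toy_raid_table is an illustrative subset, not the kernel's btrfs_raid_array):

/* Illustrative sketch only: a made-up profile table, not kernel data. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_raid_attr {
        const char *name;
        int ncopies;    /* how many copies of the data exist */
        int nparity;    /* stripes holding parity instead of data */
};

static const struct toy_raid_attr toy_raid_table[] = {
        { "single", 1, 0 },
        { "raid1",  2, 0 },
        { "raid5",  1, 1 },
        { "raid6",  1, 2 },
};

/* The stripe-count sanity rules applied per chunk. */
static bool toy_chunk_stripes_valid(const struct toy_raid_attr *attr,
                                    uint16_t num_stripes)
{
        if (num_stripes == 0)
                return false;   /* a chunk must have stripes */
        if (num_stripes < attr->ncopies)
                return false;   /* not enough stripes to hold all copies */
        if (attr->nparity && num_stripes == attr->nparity)
                return false;   /* parity-only chunk would hold no data */
        return true;
}

int main(void)
{
        /* a "raid6" chunk with only 2 stripes would be parity-only: rejected */
        printf("%d\n", toy_chunk_stripes_valid(&toy_raid_table[3], 2));
        /* 4 stripes is fine */
        printf("%d\n", toy_chunk_stripes_valid(&toy_raid_table[3], 4));
        return 0;
}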
+ */ +int btrfs_check_chunk_valid(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + u64 length; + u64 chunk_end; + u64 stripe_len; + u16 num_stripes; + u16 sub_stripes; + u64 type; + u64 features; + bool mixed = false; + int raid_index; + int nparity; + int ncopies; + + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + raid_index = btrfs_bg_flags_to_raid_index(type); + ncopies = btrfs_raid_array[raid_index].ncopies; + nparity = btrfs_raid_array[raid_index].nparity; + + if (unlikely(!num_stripes)) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes, have %u", num_stripes); + return -EUCLEAN; + } + if (unlikely(num_stripes < ncopies)) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes < ncopies, have %u < %d", + num_stripes, ncopies); + return -EUCLEAN; + } + if (unlikely(nparity && num_stripes == nparity)) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes == nparity, have %u == %d", + num_stripes, nparity); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { + chunk_err(leaf, chunk, logical, + "invalid chunk logical, have %llu should aligned to %u", + logical, fs_info->sectorsize); + return -EUCLEAN; + } + if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { + chunk_err(leaf, chunk, logical, + "invalid chunk sectorsize, have %u expect %u", + btrfs_chunk_sector_size(leaf, chunk), + fs_info->sectorsize); + return -EUCLEAN; + } + if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { + chunk_err(leaf, chunk, logical, + "invalid chunk length, have %llu", length); + return -EUCLEAN; + } + if (unlikely(check_add_overflow(logical, length, &chunk_end))) { + chunk_err(leaf, chunk, logical, +"invalid chunk logical start and length, have logical start %llu length %llu", + logical, length); + return -EUCLEAN; + } + if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { + chunk_err(leaf, chunk, logical, + "invalid chunk stripe length: %llu", + stripe_len); + return -EUCLEAN; + } + /* + * We artificially limit the chunk size, so that the number of stripes + * inside a chunk can be fit into a U32. The current limit (256G) is + * way too large for real world usage anyway, and it's also much larger + * than our existing limit (10G). + * + * Thus it should be a good way to catch obvious bitflips. 
+ */ + if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { + chunk_err(leaf, chunk, logical, + "chunk length too large: have %llu limit %llu", + length, btrfs_stripe_nr_to_offset(U32_MAX)); + return -EUCLEAN; + } + if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK))) { + chunk_err(leaf, chunk, logical, + "unrecognized chunk type: 0x%llx", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EUCLEAN; + } + + if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { + chunk_err(leaf, chunk, logical, + "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } + if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { + chunk_err(leaf, chunk, logical, + "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", + type, BTRFS_BLOCK_GROUP_TYPE_MASK); + return -EUCLEAN; + } + + if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA)))) { + chunk_err(leaf, chunk, logical, + "system chunk with data or metadata type: 0x%llx", + type); + return -EUCLEAN; + } + + features = btrfs_super_incompat_flags(fs_info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = true; + + if (!mixed) { + if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA))) { + chunk_err(leaf, chunk, logical, + "mixed chunk type in non-mixed mode: 0x%llx", type); + return -EUCLEAN; + } + } + + if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && + sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || + (type & BTRFS_BLOCK_GROUP_RAID1 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID1C3 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID1C4 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID5 && + num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID6 && + num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || + (type & BTRFS_BLOCK_GROUP_DUP && + num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { + chunk_err(leaf, chunk, logical, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } + + return 0; +} + +/* + * Enhanced version of chunk item checker. + * + * The common btrfs_check_chunk_valid() doesn't check item size since it needs + * to work on super block sys_chunk_array which doesn't have full item ptr. 
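The extra check described here is purely about item size: a chunk item must be exactly as large as its stripe count implies. A minimal sketch of that size rule, assuming the common layout where the chunk header embeds its first stripe (the toy_* structs below are invented and do not match the on-disk format byte for byte):

/* Illustrative sketch only: toy layouts, not the on-disk structures. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins: a header followed by stripe records, with the first
 * stripe embedded in the header (the assumption stated in the lead-in). */
struct toy_stripe { uint64_t devid; uint64_t offset; uint8_t uuid[16]; };
struct toy_chunk  { uint64_t length; uint16_t num_stripes; struct toy_stripe first_stripe; };

static size_t toy_chunk_item_size(uint16_t num_stripes)
{
        return sizeof(struct toy_chunk) +
               sizeof(struct toy_stripe) * (num_stripes - 1);
}

/* Leaf-side check: the item must be exactly as large as its stripe count
 * says, once the item is at least big enough to read the header at all. */
static bool toy_chunk_item_size_valid(size_t item_size, uint16_t num_stripes)
{
        if (item_size < sizeof(struct toy_chunk))
                return false;   /* too small to even read the header */
        if (num_stripes == 0)
                return true;    /* leave that case to the common check */
        return item_size == toy_chunk_item_size(num_stripes);
}

int main(void)
{
        printf("%d\n", toy_chunk_item_size_valid(toy_chunk_item_size(3), 3));
        printf("%d\n", toy_chunk_item_size_valid(sizeof(struct toy_chunk), 3));
        return 0;
}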
+ */ +static int check_leaf_chunk_item(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_key *key, int slot) +{ + int num_stripes; + + if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect [%zu, %u)", + btrfs_item_size(leaf, slot), + sizeof(struct btrfs_chunk), + BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Let btrfs_check_chunk_valid() handle this error type */ + if (num_stripes == 0) + goto out; + + if (unlikely(btrfs_chunk_item_size(num_stripes) != + btrfs_item_size(leaf, slot))) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect %lu", + btrfs_item_size(leaf, slot), + btrfs_chunk_item_size(num_stripes)); + return -EUCLEAN; + } +out: + return btrfs_check_chunk_valid(leaf, chunk, key->offset); +} + +__printf(3, 4) +__cold +static void dev_item_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +static int check_dev_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dev_item *ditem; + const u32 item_size = btrfs_item_size(leaf, slot); + + if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { + dev_item_err(leaf, slot, + "invalid objectid: has=%llu expect=%llu", + key->objectid, BTRFS_DEV_ITEMS_OBJECTID); + return -EUCLEAN; + } + + if (unlikely(item_size != sizeof(*ditem))) { + dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*ditem)); + return -EUCLEAN; + } + + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); + if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { + dev_item_err(leaf, slot, + "devid mismatch: key has=%llu item has=%llu", + key->offset, btrfs_device_id(leaf, ditem)); + return -EUCLEAN; + } + + /* + * For device total_bytes, we don't have reliable way to check it, as + * it can be 0 for device removal. Device size check can only be done + * by dev extents check. + */ + if (unlikely(btrfs_device_bytes_used(leaf, ditem) > + btrfs_device_total_bytes(leaf, ditem))) { + dev_item_err(leaf, slot, + "invalid bytes used: have %llu expect [0, %llu]", + btrfs_device_bytes_used(leaf, ditem), + btrfs_device_total_bytes(leaf, ditem)); + return -EUCLEAN; + } + /* + * Remaining members like io_align/type/gen/dev_group aren't really + * utilized. Skip them to make later usage of them easier. 
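The inode item checks coming up next validate the mode field with two rules: no bits outside the permitted mask, and a type field that names exactly one known file type. A small userspace sketch of those rules using the standard <sys/stat.h> mode bits (TOY_VALID_MODE_MASK is a name invented here):

/* Illustrative sketch only: userspace C using <sys/stat.h> mode bits. */
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

/* Bits an inode mode may legitimately contain: file type, setuid/setgid,
 * the sticky bit and the rwx permission bits. */
#define TOY_VALID_MODE_MASK (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777)

static bool toy_inode_mode_valid(unsigned int mode)
{
        unsigned int type = mode & S_IFMT;

        if (mode & ~TOY_VALID_MODE_MASK)
                return false;   /* unknown bits outside the mask */

        /* The type field must name exactly one known file type. */
        switch (type) {
        case S_IFREG:
        case S_IFDIR:
        case S_IFLNK:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFIFO:
        case S_IFSOCK:
                return true;
        default:
                return false;
        }
}

int main(void)
{
        printf("%d\n", toy_inode_mode_valid(S_IFREG | 0644)); /* accepted */
        printf("%d\n", toy_inode_mode_valid(0644)); /* no file type: rejected */
        return 0;
}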
+ */ + return 0; +} + +static int check_inode_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_inode_item *iitem; + u64 super_gen = btrfs_super_generation(fs_info->super_copy); + u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + const u32 item_size = btrfs_item_size(leaf, slot); + u32 mode; + int ret; + u32 flags; + u32 ro_flags; + + ret = check_inode_key(leaf, key, slot); + if (unlikely(ret < 0)) + return ret; + + if (unlikely(item_size != sizeof(*iitem))) { + generic_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*iitem)); + return -EUCLEAN; + } + + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); + + /* Here we use super block generation + 1 to handle log tree */ + if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) { + inode_item_err(leaf, slot, + "invalid inode generation: has %llu expect (0, %llu]", + btrfs_inode_generation(leaf, iitem), + super_gen + 1); + return -EUCLEAN; + } + /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ + if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) { + inode_item_err(leaf, slot, + "invalid inode transid: has %llu expect [0, %llu]", + btrfs_inode_transid(leaf, iitem), super_gen + 1); + return -EUCLEAN; + } + + /* + * For size and nbytes it's better not to be too strict, as for dir + * item its size/nbytes can easily get wrong, but doesn't affect + * anything in the fs. So here we skip the check. + */ + mode = btrfs_inode_mode(leaf, iitem); + if (unlikely(mode & ~valid_mask)) { + inode_item_err(leaf, slot, + "unknown mode bit detected: 0x%x", + mode & ~valid_mask); + return -EUCLEAN; + } + + /* + * S_IFMT is not bit mapped so we can't completely rely on + * is_power_of_2/has_single_bit_set, but it can save us from checking + * FIFO/CHR/DIR/REG. 
Only needs to check BLK, LNK and SOCK + */ + if (!has_single_bit_set(mode & S_IFMT)) { + if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) { + inode_item_err(leaf, slot, + "invalid mode: has 0%o expect valid S_IF* bit(s)", + mode & S_IFMT); + return -EUCLEAN; + } + } + if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) { + inode_item_err(leaf, slot, + "invalid nlink: has %u expect no more than 1 for dir", + btrfs_inode_nlink(leaf, iitem)); + return -EUCLEAN; + } + btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags); + if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) { + inode_item_err(leaf, slot, + "unknown incompat flags detected: 0x%x", flags); + return -EUCLEAN; + } + if (unlikely(!sb_rdonly(fs_info->sb) && + (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) { + inode_item_err(leaf, slot, + "unknown ro-compat flags detected on writeable mount: 0x%x", + ro_flags); + return -EUCLEAN; + } + return 0; +} + +static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_root_item ri = { 0 }; + const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | + BTRFS_ROOT_SUBVOL_DEAD; + int ret; + + ret = check_root_key(leaf, key, slot); + if (unlikely(ret < 0)) + return ret; + + if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && + btrfs_item_size(leaf, slot) != + btrfs_legacy_root_item_size())) { + generic_err(leaf, slot, + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); + return -EUCLEAN; + } + + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow generation_v2 as 0, it will still pass the check.
+ */ + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), + btrfs_item_size(leaf, slot)); + + /* Generation related */ + if (unlikely(btrfs_root_generation(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { + generic_err(leaf, slot, + "invalid root generation, have %llu expect (0, %llu]", + btrfs_root_generation(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (unlikely(btrfs_root_generation_v2(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { + generic_err(leaf, slot, + "invalid root v2 generation, have %llu expect (0, %llu]", + btrfs_root_generation_v2(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (unlikely(btrfs_root_last_snapshot(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { + generic_err(leaf, slot, + "invalid root last_snapshot, have %llu expect (0, %llu]", + btrfs_root_last_snapshot(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + + /* Alignment and level check */ + if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) { + generic_err(leaf, slot, + "invalid root bytenr, have %llu expect to be aligned to %u", + btrfs_root_bytenr(&ri), fs_info->sectorsize); + return -EUCLEAN; + } + if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* Flags check */ + if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) { + generic_err(leaf, slot, + "invalid root flags, have 0x%llx expect mask 0x%llx", + btrfs_root_flags(&ri), valid_root_flags); + return -EUCLEAN; + } + return 0; +} + +__printf(3,4) +__cold +static void extent_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + u64 bytenr; + u64 len; + + btrfs_item_key_to_cpu(eb, &key, slot); + bytenr = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_TREE_BLOCK_REF_KEY || + key.type == BTRFS_SHARED_BLOCK_REF_KEY) + len = eb->fs_info->nodesize; + else + len = key.offset; + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", + btrfs_header_level(eb) == 0 ? 
"leaf" : "node", + eb->start, slot, bytenr, len, &vaf); + va_end(args); +} + +static int check_extent_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_extent_item *ei; + bool is_tree_block = false; + unsigned long ptr; /* Current pointer inside inline refs */ + unsigned long end; /* Extent item end */ + const u32 item_size = btrfs_item_size(leaf, slot); + u64 flags; + u64 generation; + u64 total_refs; /* Total refs in btrfs_extent_item */ + u64 inline_refs = 0; /* found total inline refs */ + + if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && + !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) { + generic_err(leaf, slot, +"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); + return -EUCLEAN; + } + /* key->objectid is the bytenr for both key types */ + if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) { + generic_err(leaf, slot, + "invalid key objectid, have %llu expect to be aligned to %u", + key->objectid, fs_info->sectorsize); + return -EUCLEAN; + } + + /* key->offset is tree level for METADATA_ITEM_KEY */ + if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && + key->offset >= BTRFS_MAX_LEVEL)) { + extent_err(leaf, slot, + "invalid tree level, have %llu expect [0, %u]", + key->offset, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* + * EXTENT/METADATA_ITEM consists of: + * 1) One btrfs_extent_item + * Records the total refs, type and generation of the extent. + * + * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only) + * Records the first key and level of the tree block. + * + * 2) Zero or more btrfs_extent_inline_ref(s) + * Each inline ref has one btrfs_extent_inline_ref shows: + * 2.1) The ref type, one of the 4 + * TREE_BLOCK_REF Tree block only + * SHARED_BLOCK_REF Tree block only + * EXTENT_DATA_REF Data only + * SHARED_DATA_REF Data only + * 2.2) Ref type specific data + * Either using btrfs_extent_inline_ref::offset, or specific + * data structure. 
+ */ + if (unlikely(item_size < sizeof(*ei))) { + extent_err(leaf, slot, + "invalid item size, have %u expect [%zu, %u)", + item_size, sizeof(*ei), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + end = item_size + btrfs_item_ptr_offset(leaf, slot); + + /* Checks against extent_item */ + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + total_refs = btrfs_extent_refs(leaf, ei); + generation = btrfs_extent_generation(leaf, ei); + if (unlikely(generation > + btrfs_super_generation(fs_info->super_copy) + 1)) { + extent_err(leaf, slot, + "invalid generation, have %llu expect (0, %llu]", + generation, + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK)))) { + extent_err(leaf, slot, + "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", + flags, BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK); + return -EUCLEAN; + } + is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); + if (is_tree_block) { + if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY && + key->offset != fs_info->nodesize)) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect %u", + key->offset, fs_info->nodesize); + return -EUCLEAN; + } + } else { + if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) { + extent_err(leaf, slot, + "invalid key type, have %u expect %u for data backref", + key->type, BTRFS_EXTENT_ITEM_KEY); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect aligned to %u", + key->offset, fs_info->sectorsize); + return -EUCLEAN; + } + if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + extent_err(leaf, slot, + "invalid extent flag, data has full backref set"); + return -EUCLEAN; + } + } + ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); + + /* Check the special case of btrfs_tree_block_info */ + if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) { + extent_err(leaf, slot, + "invalid tree block info level, have %u expect [0, %u]", + btrfs_tree_block_level(leaf, info), + BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1); + } + + /* Check inline refs */ + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u64 dref_offset; + u64 inline_offset; + u8 inline_type; + + if (unlikely(ptr + sizeof(*iref) > end)) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %zu end %lu", + ptr, sizeof(*iref), end); + return -EUCLEAN; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + inline_type = btrfs_extent_inline_ref_type(leaf, iref); + inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %u end %lu", + ptr, btrfs_extent_inline_ref_size(inline_type), end); + return -EUCLEAN; + } + + switch (inline_type) { + /* inline_offset is subvolid of the owner, no need to check */ + case BTRFS_TREE_BLOCK_REF_KEY: + inline_refs++; + break; + /* Contains parent bytenr */ + case BTRFS_SHARED_BLOCK_REF_KEY: + if 
(unlikely(!IS_ALIGNED(inline_offset, + fs_info->sectorsize))) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs++; + break; + /* + * Contains owner subvolid, owner key objectid, adjusted offset. + * The only obvious corruption can happen in that offset. + */ + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + if (unlikely(!IS_ALIGNED(dref_offset, + fs_info->sectorsize))) { + extent_err(leaf, slot, + "invalid data ref offset, have %llu expect aligned to %u", + dref_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_extent_data_ref_count(leaf, dref); + break; + /* Contains parent bytenr and ref count */ + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + if (unlikely(!IS_ALIGNED(inline_offset, + fs_info->sectorsize))) { + extent_err(leaf, slot, + "invalid data parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_shared_data_ref_count(leaf, sref); + break; + default: + extent_err(leaf, slot, "unknown inline ref type: %u", + inline_type); + return -EUCLEAN; + } + ptr += btrfs_extent_inline_ref_size(inline_type); + } + /* No padding is allowed */ + if (unlikely(ptr != end)) { + extent_err(leaf, slot, + "invalid extent item size, padding bytes found"); + return -EUCLEAN; + } + + /* Finally, check the inline refs against total refs */ + if (unlikely(inline_refs > total_refs)) { + extent_err(leaf, slot, + "invalid extent refs, have %llu expect >= inline %llu", + total_refs, inline_refs); + return -EUCLEAN; + } + + if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) || + (prev_key->type == BTRFS_METADATA_ITEM_KEY)) { + u64 prev_end = prev_key->objectid; + + if (prev_key->type == BTRFS_METADATA_ITEM_KEY) + prev_end += fs_info->nodesize; + else + prev_end += prev_key->offset; + + if (unlikely(prev_end > key->objectid)) { + extent_err(leaf, slot, + "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", + prev_key->objectid, prev_key->type, + prev_key->offset, key->objectid, key->type, + key->offset); + return -EUCLEAN; + } + } + + return 0; +} + +static int check_simple_keyed_refs(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 expect_item_size = 0; + + if (key->type == BTRFS_SHARED_DATA_REF_KEY) + expect_item_size = sizeof(struct btrfs_shared_data_ref); + + if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { + generic_err(leaf, slot, + "invalid item size, have %u expect %u for key type %u", + btrfs_item_size(leaf, slot), + expect_item_size, key->type); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY && + !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + key->offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + return 0; +} + +static int check_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_extent_data_ref *dref; + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + const 
unsigned long end = ptr + btrfs_item_size(leaf, slot); + + if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { + generic_err(leaf, slot, + "invalid item size, have %u expect aligned to %zu for key type %u", + btrfs_item_size(leaf, slot), + sizeof(*dref), key->type); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + for (; ptr < end; ptr += sizeof(*dref)) { + u64 offset; + + /* + * We cannot check the extent_data_ref hash due to possible + * overflow from the leaf due to hash collisions. + */ + dref = (struct btrfs_extent_data_ref *)ptr; + offset = btrfs_extent_data_ref_offset(leaf, dref); + if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { + extent_err(leaf, slot, + "invalid extent data backref offset, have %llu expect aligned to %u", + offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + } + return 0; +} + +#define inode_ref_err(eb, slot, fmt, args...) \ + inode_item_err(eb, slot, fmt, ##args) +static int check_inode_ref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + struct btrfs_inode_ref *iref; + unsigned long ptr; + unsigned long end; + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + /* namelen can't be 0, so item_size == sizeof() is also invalid */ + if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { + inode_ref_err(leaf, slot, + "invalid item size, have %u expect (%zu, %u)", + btrfs_item_size(leaf, slot), + sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + ptr = btrfs_item_ptr_offset(leaf, slot); + end = ptr + btrfs_item_size(leaf, slot); + while (ptr < end) { + u16 namelen; + + if (unlikely(ptr + sizeof(iref) > end)) { + inode_ref_err(leaf, slot, + "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", + ptr, end, sizeof(iref)); + return -EUCLEAN; + } + + iref = (struct btrfs_inode_ref *)ptr; + namelen = btrfs_inode_ref_name_len(leaf, iref); + if (unlikely(ptr + sizeof(*iref) + namelen > end)) { + inode_ref_err(leaf, slot, + "inode ref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + + /* + * NOTE: In theory we should record all found index numbers + * to find any duplicated indexes, but that will be too time + * consuming for inodes with too many hard links. + */ + ptr += sizeof(*iref) + namelen; + } + return 0; +} + +/* + * Common point to switch the item-specific validation. 
+ */ +static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) +{ + int ret = 0; + struct btrfs_chunk *chunk; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(leaf, key, slot, prev_key); + break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(leaf, key, slot, prev_key); + break; + case BTRFS_DIR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + ret = check_dir_item(leaf, key, prev_key, slot); + break; + case BTRFS_INODE_REF_KEY: + ret = check_inode_ref(leaf, key, prev_key, slot); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(leaf, key, slot); + break; + case BTRFS_CHUNK_ITEM_KEY: + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = check_leaf_chunk_item(leaf, chunk, key, slot); + break; + case BTRFS_DEV_ITEM_KEY: + ret = check_dev_item(leaf, key, slot); + break; + case BTRFS_INODE_ITEM_KEY: + ret = check_inode_item(leaf, key, slot); + break; + case BTRFS_ROOT_ITEM_KEY: + ret = check_root_item(leaf, key, slot); + break; + case BTRFS_EXTENT_ITEM_KEY: + case BTRFS_METADATA_ITEM_KEY: + ret = check_extent_item(leaf, key, slot, prev_key); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = check_simple_keyed_refs(leaf, key, slot); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + ret = check_extent_data_ref(leaf, key, slot); + break; + } + + if (ret) + return BTRFS_TREE_BLOCK_INVALID_ITEM; + return BTRFS_TREE_BLOCK_CLEAN; +} + +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + /* No valid key type is 0, so all key should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; + struct btrfs_key key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (unlikely(btrfs_header_level(leaf) != 0)) { + generic_err(leaf, 0, + "invalid level for leaf, have %d expect 0", + btrfs_header_level(leaf)); + return BTRFS_TREE_BLOCK_INVALID_LEVEL; + } + + /* + * Extent buffers from a relocation tree have a owner field that + * corresponds to the subvolume tree they are based on. So just from an + * extent buffer alone we can not find out what is the id of the + * corresponding subvolume tree, so we can not figure out if the extent + * buffer corresponds to the root of the relocation tree or not. So + * skip this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + u64 owner = btrfs_header_owner(leaf); + + /* These trees must never be empty */ + if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || + owner == BTRFS_CHUNK_TREE_OBJECTID || + owner == BTRFS_DEV_TREE_OBJECTID || + owner == BTRFS_FS_TREE_OBJECTID || + owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + generic_err(leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; + } + + /* Unknown tree */ + if (unlikely(owner == 0)) { + generic_err(leaf, 0, + "invalid owner, root 0 is not defined"); + return BTRFS_TREE_BLOCK_INVALID_OWNER; + } + + /* EXTENT_TREE_V2 can have empty extent trees. 
*/ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_TREE_BLOCK_CLEAN; + + if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { + generic_err(leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; + } + + return BTRFS_TREE_BLOCK_CLEAN; + } + + if (unlikely(nritems == 0)) + return BTRFS_TREE_BLOCK_CLEAN; + + /* + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key ordering + * 2) item offset and size + * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. + */ + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + u64 item_data_end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* Make sure the keys are in the right order */ + if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { + generic_err(leaf, slot, + "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", + prev_key.objectid, prev_key.type, + prev_key.offset, key.objectid, key.type, + key.offset); + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; + } + + item_data_end = (u64)btrfs_item_offset(leaf, slot) + + btrfs_item_size(leaf, slot); + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else + item_end_expected = btrfs_item_offset(leaf, + slot - 1); + if (unlikely(item_data_end != item_end_expected)) { + generic_err(leaf, slot, + "unexpected item end, have %llu expect %u", + item_data_end, item_end_expected); + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ + if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { + generic_err(leaf, slot, + "slot end outside of leaf, have %llu expect range [0, %u]", + item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; + } + + /* Also check if the item pointer overlaps with btrfs item. */ + if (unlikely(btrfs_item_ptr_offset(leaf, slot) < + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) { + generic_err(leaf, slot, + "slot overlaps with its data, item end %lu data start %lu", + btrfs_item_nr_offset(leaf, slot) + + sizeof(struct btrfs_item), + btrfs_item_ptr_offset(leaf, slot)); + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; + } + + /* + * We only want to do this if WRITTEN is set, otherwise the leaf + * may be in some intermediate state and won't appear valid. 
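The offset and size checks earlier in this loop boil down to a single invariant: item data is packed back to front, with no holes and no overlap, and stays inside the leaf's data area. A compact userspace sketch of that invariant (toy_item and the 100-byte data area are invented for illustration):

/* Illustrative sketch only: a toy leaf layout, not the real extent buffer. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_item { uint32_t offset; uint32_t size; }; /* offsets into the data area */

/*
 * Item data is packed back to front: slot 0's data ends at the end of the
 * data area, and each following slot's data ends where the previous slot's
 * data starts. Any hole or overlap is corruption.
 */
static bool toy_leaf_offsets_valid(const struct toy_item *items, int nritems,
                                   uint32_t data_size)
{
        uint32_t expected_end = data_size;

        for (int slot = 0; slot < nritems; slot++) {
                uint64_t item_end = (uint64_t)items[slot].offset + items[slot].size;

                if (item_end != expected_end)
                        return false;   /* hole or overlap between items */
                if (item_end > data_size)
                        return false;   /* must stay inside the data area */
                expected_end = items[slot].offset; /* next slot must end here */
        }
        return true;
}

int main(void)
{
        /* 100-byte data area holding items of 40 and 60 bytes, back to front. */
        struct toy_item good[] = { { 60, 40 }, { 0, 60 } };
        struct toy_item bad[]  = { { 60, 40 }, { 10, 40 } }; /* leaves a hole */

        printf("%d\n", toy_leaf_offsets_valid(good, 2, 100));
        printf("%d\n", toy_leaf_offsets_valid(bad, 2, 100));
        return 0;
}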
+ */ + if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { + enum btrfs_tree_block_status ret; + + /* + * Check if the item size and content meet other + * criteria + */ + ret = check_leaf_item(leaf, &key, slot, &prev_key); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return ret; + } + + prev_key.objectid = key.objectid; + prev_key.type = key.type; + prev_key.offset = key.offset; + } + + return BTRFS_TREE_BLOCK_CLEAN; +} + +int btrfs_check_leaf(struct extent_buffer *leaf) +{ + enum btrfs_tree_block_status ret; + + ret = __btrfs_check_leaf(leaf); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; +} +ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); + +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) +{ + struct btrfs_fs_info *fs_info = node->fs_info; + unsigned long nr = btrfs_header_nritems(node); + struct btrfs_key key, next_key; + int slot; + int level = btrfs_header_level(node); + u64 bytenr; + + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { + generic_err(node, 0, + "invalid level for node, have %d expect [1, %d]", + level, BTRFS_MAX_LEVEL - 1); + return BTRFS_TREE_BLOCK_INVALID_LEVEL; + } + if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { + btrfs_crit(fs_info, +"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", + btrfs_header_owner(node), node->start, + nr == 0 ? "small" : "large", nr, + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; + } + + for (slot = 0; slot < nr - 1; slot++) { + bytenr = btrfs_node_blockptr(node, slot); + btrfs_node_key_to_cpu(node, &key, slot); + btrfs_node_key_to_cpu(node, &next_key, slot + 1); + + if (unlikely(!bytenr)) { + generic_err(node, slot, + "invalid NULL node pointer"); + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; + } + if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { + generic_err(node, slot, + "unaligned pointer, have %llu should be aligned to %u", + bytenr, fs_info->sectorsize); + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; + } + + if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { + generic_err(node, slot, + "bad key order, current (%llu %u %llu) next (%llu %u %llu)", + key.objectid, key.type, key.offset, + next_key.objectid, next_key.type, + next_key.offset); + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; + } + } + return BTRFS_TREE_BLOCK_CLEAN; +} + +int btrfs_check_node(struct extent_buffer *node) +{ + enum btrfs_tree_block_status ret; + + ret = __btrfs_check_node(node); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; +} +ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); + +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) +{ + const bool is_subvol = is_fstree(root_owner); + const u64 eb_owner = btrfs_header_owner(eb); + + /* + * Skip dummy fs, as selftests don't create unique ebs for each dummy + * root. + */ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + return 0; + /* + * There are several call sites (backref walking, qgroup, and data + * reloc) passing 0 as @root_owner, as they are not holding the + * tree root. In that case, we can not do a reliable ownership check, + * so just exit. + */ + if (root_owner == 0) + return 0; + /* + * These trees use key.offset as their owner, our callers don't have + * the extra capacity to pass key.offset here. So we just skip them. 
+ */ + if (root_owner == BTRFS_TREE_LOG_OBJECTID || + root_owner == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!is_subvol) { + /* For non-subvolume trees, the eb owner should match root owner */ + if (unlikely(root_owner != eb_owner)) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + root_owner); + return -EUCLEAN; + } + return 0; + } + + /* + * For subvolume trees, owners can mismatch, but they should all belong + * to subvolume trees. + */ + if (unlikely(is_subvol != is_fstree(eb_owner))) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + return 0; +} + +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); + return -EIO; + } + + if (!first_key) + return 0; + + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. + */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + + if (ret) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + } + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h new file mode 100644 index 0000000000..3c2a02a72f --- /dev/null +++ b/fs/btrfs/tree-checker.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + */ + +#ifndef BTRFS_TREE_CHECKER_H +#define BTRFS_TREE_CHECKER_H + +#include + +struct extent_buffer; +struct btrfs_chunk; + +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. + */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utlized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. 
+ * + * This check can be skipped if @has_first_key is false, such skip + * can happen for case where we don't have the parent node key, + * e.g. reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; + +enum btrfs_tree_block_status { + BTRFS_TREE_BLOCK_CLEAN, + BTRFS_TREE_BLOCK_INVALID_NRITEMS, + BTRFS_TREE_BLOCK_INVALID_PARENT_KEY, + BTRFS_TREE_BLOCK_BAD_KEY_ORDER, + BTRFS_TREE_BLOCK_INVALID_LEVEL, + BTRFS_TREE_BLOCK_INVALID_FREE_SPACE, + BTRFS_TREE_BLOCK_INVALID_OFFSETS, + BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, + BTRFS_TREE_BLOCK_INVALID_ITEM, + BTRFS_TREE_BLOCK_INVALID_OWNER, +}; + +/* + * Exported simply for btrfs-progs which wants to have the + * btrfs_tree_block_status return codes. + */ +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf); +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); + +int btrfs_check_leaf(struct extent_buffer *leaf); +int btrfs_check_node(struct extent_buffer *node); + +int btrfs_check_chunk_valid(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); + +#endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 0000000000..9fb64af608 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,7534 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "tree-log.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "backref.h" +#include "compression.h" +#include "qgroup.h" +#include "block-group.h" +#include "space-info.h" +#include "zoned.h" +#include "inode-item.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "dir-item.h" +#include "file-item.h" +#include "file.h" +#include "orphan.h" +#include "tree-checker.h" + +#define MAX_CONFLICT_INODES 10 + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +enum { + LOG_INODE_ALL, + LOG_INODE_EXISTS, +}; + +/* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. 
With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find, + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +enum { + LOG_WALK_PIN_ONLY, + LOG_WALK_REPLAY_INODES, + LOG_WALK_REPLAY_DIR_INDEX, + LOG_WALK_REPLAY_ALL, +}; + +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + int inode_only, + struct btrfs_log_ctx *ctx); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree and a 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root); that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * start a sub transaction and set up the log tree. + * This increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); + int ret = 0; + bool created = false; + + /* + * First check if the log root tree was already created. If not, create + * it before locking the root's log_mutex, just to keep lockdep happy.
+ */ + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { + mutex_lock(&tree_root->log_mutex); + if (!fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, fs_info); + if (!ret) { + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } + } + mutex_unlock(&tree_root->log_mutex); + if (ret) + return ret; + } + + mutex_lock(&root->log_mutex); + +again: + if (root->log_root) { + int index = (root->log_transid + 1) % 2; + + if (btrfs_need_log_full_commit(trans)) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto out; + } + + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + + if (!root->log_start_pid) { + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; + } else if (root->log_start_pid != current->pid) { + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + } + } else { + /* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. + */ + if (zoned && !created) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto out; + } + + ret = btrfs_add_log_tree(trans, root); + if (ret) + goto out; + + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; + } + + atomic_inc(&root->log_writers); + if (!ctx->logging_new_name) { + int index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } + +out: + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + const bool zoned = btrfs_is_zoned(root->fs_info); + int ret = -ENOENT; + + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) + return ret; + + mutex_lock(&root->log_mutex); +again: + if (root->log_root) { + int index = (root->log_transid + 1) % 2; + + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + atomic_inc(&root->log_writers); + } + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +void btrfs_pin_log_trans(struct btrfs_root *root) +{ + atomic_inc(&root->log_writers); +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +void btrfs_end_log_trans(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->log_writers)) { + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&root->log_writer_wait); + } +} + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? 
This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in + * the LOG_WALK_REPLAY_INODES stage. + */ + bool ignore_cur_inode; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level) +{ + struct btrfs_fs_info *fs_info = log->fs_info; + int ret = 0; + + /* + * If this fs is mixed then we need to be able to process the leaves to + * pin down any logged extents, so we have to read the block. + */ + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen + }; + + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) + return ret; + } + + if (wc->pin) { + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, + eb->len); + if (ret) + return ret; + + if (btrfs_buffer_uptodate(eb, gen, 0) && + btrfs_header_level(eb) == 0) + ret = btrfs_exclude_logged_extents(eb); + } + return ret; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; + + /* + * This is only used during log replay, so the root is always from a + * fs/subvolume tree. In case we ever need to support a log root, then + * we'll have to clone the leaf in the path, release the path and use + * the leaf before writing into the log tree. See the comments at + * copy_items() for more details. + */ + ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + + item_size = btrfs_item_size(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* Look for the key in the destination tree. 
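The walk_control structure above is essentially one tree walker reused with different process functions. A minimal standalone sketch of that callback pattern, with invented types (this is not the kernel's walk_control), is:

/* Standalone sketch: one walker, two interchangeable process callbacks. */
#include <stdio.h>

struct block { unsigned long long start; unsigned len; };

struct walk_ctl {
	int pin;		/* pin-only walk? */
	int stage;		/* which replay stage we are in */
	int (*process)(struct block *b, struct walk_ctl *wc);
};

static int pin_one(struct block *b, struct walk_ctl *wc)
{
	(void)wc;
	printf("pin extent [%llu, %llu)\n", b->start, b->start + b->len);
	return 0;
}

static int replay_one(struct block *b, struct walk_ctl *wc)
{
	printf("stage %d: replay items in block at %llu\n", wc->stage, b->start);
	return 0;
}

static int walk(struct block *blocks, int n, struct walk_ctl *wc)
{
	for (int i = 0; i < n; i++) {
		int ret = wc->process(&blocks[i], wc);
		if (ret)
			return ret;	/* abort the walk on the first error */
	}
	return 0;
}

int main(void)
{
	struct block blocks[] = { { 4096, 4096 }, { 8192, 4096 } };
	struct walk_ctl wc = { .pin = 1, .process = pin_one };

	walk(blocks, 2, &wc);		/* first pass: pin extents */
	wc.pin = 0;
	wc.stage = 1;
	wc.process = replay_one;
	walk(blocks, 2, &wc);		/* later passes: replay contents */
	return 0;
}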
*/ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(path); + return 0; + } + + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + u32 mode; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + + /* + * If this is a directory we need to reset the i_size to + * 0 so that we can set it up properly when replaying + * the rest of the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + u32 mode; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. + */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); + + /* + * If this is a directory we need to reset the i_size to 0 so + * that we can set it up properly when replaying the rest of + * the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); + } +insert: + btrfs_release_path(path); + /* try to insert the key into the destination tree */ + path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + path->skip_release_on_error = 0; + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST || ret == -EOVERFLOW) { + u32 found_size; + found_size = btrfs_item_size(path->nodes[0], + path->slots[0]); + if (found_size > item_size) + btrfs_truncate_item(trans, path, item_size, 1); + else if (found_size < item_size) + btrfs_extend_item(trans, path, item_size - found_size); + } else if (ret) { + return ret; + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. 
+ * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) { + struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away. + */ + if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) + btrfs_set_inode_size(dst_eb, dst_item, ino_size); + goto no_copy; + } + + if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, + struct fscrypt_str *name) +{ + char *buf; + + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(eb, buf, (unsigned long)start, len); + name->name = buf; + name->len = len; + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct inode *inode; + + inode = btrfs_iget(root->fs_info->sb, objectid, root); + if (IS_ERR(inode)) + inode = NULL; + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. 
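The overwrite semantics described above (identical item: skip; size mismatch: resize, then copy; missing key: insert) can be modeled with a tiny standalone C sketch; the slot type and helper below are invented for illustration and carry none of the inode/i_size special cases handled by the real overwrite_item().

/* Standalone sketch of insert-or-resize-then-copy overwrite semantics. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot { int used; int key; size_t len; unsigned char *data; };

static int overwrite_item(struct slot *dst, int key, const void *src, size_t len)
{
	if (dst->used && dst->key == key &&
	    dst->len == len && memcmp(dst->data, src, len) == 0)
		return 0;			/* identical: skip the write */

	if (!dst->used || dst->len != len) {	/* insert or resize */
		unsigned char *p = realloc(dst->data, len);
		if (!p)
			return -1;
		dst->data = p;
		dst->len = len;
	}
	dst->used = 1;
	dst->key = key;
	memcpy(dst->data, src, len);		/* copy the logged bytes over */
	return 0;
}

int main(void)
{
	struct slot s = { 0 };

	overwrite_item(&s, 257, "old-item", 8);
	overwrite_item(&s, 257, "new-item!!", 10);	/* extended, then copied */
	printf("key %d, %zu bytes: %.*s\n",
	       s.key, s.len, (int)s.len, (const char *)s.data);
	free(s.data);
	return 0;
}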
+ */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct btrfs_drop_extents_args drop_args = { 0 }; + struct btrfs_fs_info *fs_info = root->fs_info; + int found_type; + u64 extent_end; + u64 start = key->offset; + u64 nbytes = 0; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. + */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_ram_bytes(eb, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); + extent_end = ALIGN(start + size, + fs_info->sectorsize); + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, + btrfs_ino(BTRFS_I(inode)), start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(path); + goto out; + } + } + btrfs_release_path(path); + + /* drop any overlapping extents */ + drop_args.start = start; + drop_args.end = extent_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + if (ret) + goto out; + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; + + if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + if (ret) + goto out; + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); + + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. 
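The extent_end/nbytes computation at the top of replay_one_extent() differs for regular/prealloc extents versus inline extents. A standalone arithmetic sketch, with made-up values and an ALIGN_UP() helper written in the spirit of the kernel's ALIGN() macro:

/* Standalone sketch of the extent_end and nbytes computation above. */
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long sectorsize = 4096;
	unsigned long long start = 8192;

	/* Regular or prealloc extent: the item records num_bytes directly. */
	unsigned long long num_bytes = 12288;
	unsigned long long disk_bytenr = 0;	/* 0 means a hole */
	unsigned long long extent_end = start + num_bytes;
	unsigned long long nbytes = disk_bytenr ? num_bytes : 0;
	printf("reg:    end=%llu nbytes=%llu\n", extent_end, nbytes);

	/* Inline extent: only ram_bytes is known, so the end is rounded up
	 * to the next sector boundary. */
	unsigned long long ram_bytes = 700;
	extent_end = ALIGN_UP(start + ram_bytes, sectorsize);
	printf("inline: end=%llu\n", extent_end);	/* prints 12288 */
	return 0;
}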
+ * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_trace_extent(trans, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item)); + if (ret < 0) + goto out; + + if (ins.objectid > 0) { + struct btrfs_ref ref = { 0 }; + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + + /* + * is this extent already allocated in the extent + * allocation tree? If so, just add a reference + */ + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, + ins.offset); + if (ret < 0) { + goto out; + } else if (ret == 0) { + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + ins.objectid, ins.offset, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, + key->objectid, offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) + goto out; + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_file_extent(trans, + root->root_key.objectid, + key->objectid, offset, &ins); + if (ret) + goto out; + } + btrfs_release_path(path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_list(root->log_root, + csum_start, csum_end - 1, + &ordered_sums, 0, false); + if (ret) + goto out; + /* + * Now delete all existing cums in the csum root that + * cover our range. We do this because we can have an + * extent that is completely referenced by one file + * extent item and partially referenced by another + * file extent item (like after using the clone or + * extent_same ioctls). In this case if we end up doing + * the replay of the one that partially references the + * extent first, and we do not do the csum deletion + * below, we can get 2 csum items in the csum tree that + * overlap each other. For example, imagine our log has + * the two following file extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent + * that starts at disk byte 12845056, and the log tree + * has a single csum item that covers the entire range + * of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the + * csum tree gets the following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K + * of our extent. 
Now when we replay the second file + * extent item, if we do not delete existing csum items + * that cover any of its blocks, we end up getting two + * csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying + * to lookup up for the checksum of any block of our + * extent starting at an offset of 40K or higher, will + * end up looking at the second csum item only, which + * does not contain the checksum for any block starting + * at offset 40K or higher of our extent. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + csum_root = btrfs_csum_root(fs_info, + sums->logical); + if (!ret) + ret = btrfs_del_csums(trans, csum_root, + sums->logical, + sums->len); + if (!ret) + ret = btrfs_csum_file_blocks(trans, + csum_root, + sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + } else { + btrfs_release_path(path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + if (ret) + goto out; + } + + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, + extent_end - start); + if (ret) + goto out; + +update_inode: + btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); +out: + iput(inode); + return ret; +} + +static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + const struct fscrypt_str *name) +{ + int ret; + + ret = btrfs_unlink_inode(trans, dir, inode, name); + if (ret) + return ret; + /* + * Whenever we need to check if a name exists or not, we check the + * fs/subvolume tree. So after an unlink we must run delayed items, so + * that future checks for a name during log replay see that the name + * does not exists anymore. + */ + return btrfs_run_delayed_items(trans); +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_inode *dir, + struct btrfs_dir_item *di) +{ + struct btrfs_root *root = dir->root; + struct inode *inode; + struct fscrypt_str name; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); + if (ret) + return -ENOMEM; + + btrfs_release_path(path); + + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); +out: + kfree(name.name); + iput(inode); + return ret; +} + +/* + * See if a given name and sequence number found in an inode back reference are + * already in a directory and correctly point to this inode. 
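Returning briefly to the csum-overlap example worked through in the comment above, the interval arithmetic can be checked with a standalone sketch; the numbers are taken from that example and the range type is invented, with no btrfs code involved.

/* Standalone arithmetic check of the overlapping-csum example above. */
#include <stdio.h>

struct range { unsigned long long start, end; };	/* [start, end) */

static int overlaps(struct range a, struct range b)
{
	return a.start < b.end && b.start < a.end;
}

int main(void)
{
	unsigned long long disk_bytenr = 12845056, disk_num_bytes = 102400;

	/* First replayed file extent: offset 20480, num_bytes 20480, not
	 * compressed, so only that sub-range of the extent is csummed. */
	struct range first = { disk_bytenr + 20480, disk_bytenr + 20480 + 20480 };

	/* Second replayed file extent references the whole extent. */
	struct range second = { disk_bytenr, disk_bytenr + disk_num_bytes };

	printf("first  csum range: [%llu, %llu)\n", first.start, first.end);
	printf("second csum range: [%llu, %llu)\n", second.start, second.end);
	printf("overlap: %s -> existing csums in the second range must be\n"
	       "deleted before the new csum items are inserted\n",
	       overlaps(first, second) ? "yes" : "no");
	return 0;
}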
+ * + * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it + * exists. + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + struct fscrypt_str *name) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int ret = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else { + goto out; + } + + btrfs_release_path(path); + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } +out: + btrfs_release_path(path); + return ret; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + u64 ref_objectid, + const struct fscrypt_str *name) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret == 1) { + ret = 0; + goto out; + } + + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name); + else + ret = !!btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], name); +out: + btrfs_free_path(path); + return ret; +} + +static inline int __add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_root *log_root, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + u64 inode_objectid, u64 parent_objectid, + u64 ref_index, struct fscrypt_str *name) +{ + int ret; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key search_key; + struct btrfs_inode_extref *extref; + +again: + /* Search old style refs */ + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = parent_objectid; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret == 0) { + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + + leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (search_key.objectid == search_key.offset) + return 1; + + /* check all the names in this back reference to see + * if they are in the log. 
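The inode_in_dir() helper above only reports a match when both directory lookups (by index and by name) resolve to the expected inode. A standalone sketch of that two-step check, using invented lookup tables instead of btrfs dir items:

/* Standalone sketch of the two-step inode_in_dir() check. */
#include <stdio.h>
#include <string.h>

struct dent { unsigned long long index; const char *name; unsigned long long ino; };

static const struct dent dir[] = {
	{ 2, "foo", 257 },
	{ 3, "bar", 258 },
};
#define NDENT (sizeof(dir) / sizeof(dir[0]))

static int lookup_by_index(unsigned long long index, unsigned long long *ino)
{
	for (size_t i = 0; i < NDENT; i++)
		if (dir[i].index == index) { *ino = dir[i].ino; return 1; }
	return 0;
}

static int lookup_by_name(const char *name, unsigned long long *ino)
{
	for (size_t i = 0; i < NDENT; i++)
		if (strcmp(dir[i].name, name) == 0) { *ino = dir[i].ino; return 1; }
	return 0;
}

static int inode_in_dir(unsigned long long objectid, unsigned long long index,
			const char *name)
{
	unsigned long long ino;

	if (!lookup_by_index(index, &ino) || ino != objectid)
		return 0;		/* wrong or missing dir index entry */
	if (!lookup_by_name(name, &ino) || ino != objectid)
		return 0;		/* wrong or missing dir item entry */
	return 1;			/* both entries agree with the log */
}

int main(void)
{
	printf("foo@2 -> 257: %d\n", inode_in_dir(257, 2, "foo"));	/* 1 */
	printf("foo@5 -> 257: %d\n", inode_in_dir(257, 5, "foo"));	/* 0 */
	return 0;
}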
if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + while (ptr < ptr_end) { + struct fscrypt_str victim_name; + + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; + + ret = backref_in_log(log_root, &search_key, + parent_objectid, &victim_name); + if (ret < 0) { + kfree(victim_name.name); + return ret; + } else if (!ret) { + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, dir, inode, + &victim_name); + kfree(victim_name.name); + if (ret) + return ret; + goto again; + } + kfree(victim_name.name); + + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; + } + } + btrfs_release_path(path); + + /* Same search but for extended refs */ + extref = btrfs_lookup_inode_extref(NULL, root, path, name, + inode_objectid, parent_objectid, 0, + 0); + if (IS_ERR(extref)) { + return PTR_ERR(extref); + } else if (extref) { + u32 item_size; + u32 cur_offset = 0; + unsigned long base; + struct inode *victim_parent; + + leaf = path->nodes[0]; + + item_size = btrfs_item_size(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + struct fscrypt_str victim_name; + + extref = (struct btrfs_inode_extref *)(base + cur_offset); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + ret = read_alloc_one_name(leaf, &extref->name, + btrfs_inode_extref_name_len(leaf, extref), + &victim_name); + if (ret) + return ret; + + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(parent_objectid, + victim_name.name, + victim_name.len); + ret = backref_in_log(log_root, &search_key, + parent_objectid, &victim_name); + if (ret < 0) { + kfree(victim_name.name); + return ret; + } else if (!ret) { + ret = -ENOENT; + victim_parent = read_one_inode(root, + parent_objectid); + if (victim_parent) { + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, + BTRFS_I(victim_parent), + inode, &victim_name); + } + iput(victim_parent); + kfree(victim_name.name); + if (ret) + return ret; + goto again; + } + kfree(victim_name.name); +next: + cur_offset += victim_name.len + sizeof(*extref); + } + } + btrfs_release_path(path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + ref_index, name, 0); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); + if (ret) + return ret; + } + btrfs_release_path(path); + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); + if (ret) + return ret; + } + btrfs_release_path(path); + + return 0; +} + +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + struct fscrypt_str *name, u64 *index, + u64 *parent_objectid) +{ + struct btrfs_inode_extref *extref; + int ret; + + extref = (struct btrfs_inode_extref *)ref_ptr; + + ret = read_alloc_one_name(eb, &extref->name, + btrfs_inode_extref_name_len(eb, extref), name); + if (ret) + return ret; + + if (index) + *index = 
btrfs_inode_extref_index(eb, extref); + if (parent_objectid) + *parent_objectid = btrfs_inode_extref_parent(eb, extref); + + return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + struct fscrypt_str *name, u64 *index) +{ + struct btrfs_inode_ref *ref; + int ret; + + ref = (struct btrfs_inode_ref *)ref_ptr; + + ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), + name); + if (ret) + return ret; + + if (index) + *index = btrfs_inode_ref_index(eb, ref); + + return 0; +} + +/* + * Take an inode reference item from the log tree and iterate all names from the + * inode reference item in the subvolume tree with the same key (if it exists). + * For any name that is not in the inode reference item from the log tree, do a + * proper unlink of that name (that is, remove its entry from the inode + * reference item and both dir index keys). + */ +static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_inode *inode, + struct extent_buffer *log_eb, + int log_slot, + struct btrfs_key *key) +{ + int ret; + unsigned long ref_ptr; + unsigned long ref_end; + struct extent_buffer *eb; + +again: + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) { + ret = 0; + goto out; + } + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + while (ref_ptr < ref_end) { + struct fscrypt_str name; + u64 parent_id; + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + ret = extref_get_fields(eb, ref_ptr, &name, + NULL, &parent_id); + } else { + parent_id = key->offset; + ret = ref_get_fields(eb, ref_ptr, &name, NULL); + } + if (ret) + goto out; + + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, &name); + else + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); + + if (!ret) { + struct inode *dir; + + btrfs_release_path(path); + dir = read_one_inode(root, parent_id); + if (!dir) { + ret = -ENOENT; + kfree(name.name); + goto out; + } + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), + inode, &name); + kfree(name.name); + iput(dir); + if (ret) + goto out; + goto again; + } + + kfree(name.name); + ref_ptr += name.len; + if (key->type == BTRFS_INODE_EXTREF_KEY) + ref_ptr += sizeof(struct btrfs_inode_extref); + else + ref_ptr += sizeof(struct btrfs_inode_ref); + } + ret = 0; + out: + btrfs_release_path(path); + return ret; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). 
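The ref_get_fields()/extref_get_fields() helpers above, and the loops that call them, walk packed variable-length records inside one item: a small fixed header followed by name_len bytes of name, advancing the cursor by header plus name length. A standalone sketch of that walk, with an invented record layout (not the on-disk btrfs format):

/* Standalone sketch of walking packed variable-length name records. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct ref_hdr {
	uint64_t index;		/* dir index this name was added at */
	uint16_t name_len;	/* length of the name that follows */
} __attribute__((packed));

int main(void)
{
	unsigned char buf[64];
	size_t off = 0;

	/* Build two packed records: ("foo", index 2) and ("foo2", index 5). */
	const char *names[] = { "foo", "foo2" };
	uint64_t idx[] = { 2, 5 };
	for (int i = 0; i < 2; i++) {
		struct ref_hdr h = { idx[i], (uint16_t)strlen(names[i]) };
		memcpy(buf + off, &h, sizeof(h));
		memcpy(buf + off + sizeof(h), names[i], h.name_len);
		off += sizeof(h) + h.name_len;
	}

	/* Walk the item the same way the ref_ptr/ref_end loops above do. */
	size_t ptr = 0, end = off;
	while (ptr < end) {
		struct ref_hdr h;

		memcpy(&h, buf + ptr, sizeof(h));
		printf("name '%.*s' at index %llu\n",
		       (int)h.name_len, (const char *)(buf + ptr + sizeof(h)),
		       (unsigned long long)h.index);
		ptr += sizeof(h) + h.name_len;
	}
	return 0;
}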
+ */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir = NULL; + struct inode *inode = NULL; + unsigned long ref_ptr; + unsigned long ref_end; + struct fscrypt_str name; + int ret; + int log_ref_ver = 0; + u64 parent_objectid; + u64 inode_objectid; + u64 ref_index = 0; + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; + + ref_struct_size = sizeof(struct btrfs_inode_extref); + log_ref_ver = 1; + r = (struct btrfs_inode_extref *)ref_ptr; + parent_objectid = btrfs_inode_extref_parent(eb, r); + } else { + ref_struct_size = sizeof(struct btrfs_inode_ref); + parent_objectid = key->offset; + } + inode_objectid = key->objectid; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, parent_objectid); + if (!dir) { + ret = -ENOENT; + goto out; + } + + inode = read_one_inode(root, inode_objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + while (ref_ptr < ref_end) { + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &name, + &ref_index, &parent_objectid); + /* + * parent object can change from one array + * item to another. + */ + if (!dir) + dir = read_one_inode(root, parent_objectid); + if (!dir) { + ret = -ENOENT; + goto out; + } + } else { + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); + } + if (ret) + goto out; + + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, &name); + if (ret < 0) { + goto out; + } else if (ret == 0) { + /* + * look for a conflicting back reference in the + * metadata. if we find one we have to unlink that name + * of the file before we add our new link. Later on, we + * overwrite any existing back reference, and we don't + * want to create dangling pointers in the directory. + */ + ret = __add_inode_ref(trans, root, path, log, + BTRFS_I(dir), BTRFS_I(inode), + inode_objectid, parent_objectid, + ref_index, &name); + if (ret) { + if (ret == 1) + ret = 0; + goto out; + } + + /* insert our name */ + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + &name, 0, ref_index); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; + } + /* Else, ret == 1, we already have a perfect match, we're done. */ + + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; + if (log_ref_ver) { + iput(dir); + dir = NULL; + } + } + + /* + * Before we overwrite the inode reference item in the subvolume tree + * with the item from the log tree, we must unlink all names from the + * parent directory that are in the subvolume's tree inode reference + * item, otherwise we end up with an inconsistent subvolume tree where + * dir index entries exist for a name but there is no inode reference + * item with the same name. 
+ */ + ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, + key); + if (ret) + goto out; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); +out: + btrfs_release_path(path); + kfree(name.name); + iput(dir); + iput(inode); + return ret; +} + +static int count_inode_extrefs(struct btrfs_root *root, + struct btrfs_inode *inode, struct btrfs_path *path) +{ + int ret = 0; + int name_len; + unsigned int nlink = 0; + u32 item_size; + u32 cur_offset = 0; + u64 inode_objectid = btrfs_ino(inode); + u64 offset = 0; + unsigned long ptr; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + + while (1) { + ret = btrfs_find_one_extref(root, inode_objectid, offset, path, + &extref, &offset); + if (ret) + break; + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_len = btrfs_inode_extref_name_len(leaf, extref); + + nlink++; + + cur_offset += name_len + sizeof(*extref); + } + + offset++; + btrfs_release_path(path); + } + btrfs_release_path(path); + + if (ret < 0 && ret != -ENOENT) + return ret; + return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, + struct btrfs_inode *inode, struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + unsigned int nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + u64 ino = btrfs_ino(inode); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } +process_slot: + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + if (path->slots[0] > 0) { + path->slots[0]--; + goto process_slot; + } + key.offset--; + btrfs_release_path(path); + } + btrfs_release_path(path); + + return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. 
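The link-count fixup described above boils down to: the authoritative link count is the number of names found across the ref and extref items, and the cached count is corrected to match (an inode left with zero names becomes an orphan). A standalone sketch with invented data:

/* Standalone sketch of recomputing nlink from back references. */
#include <stdio.h>

struct inode_model { unsigned int nlink; };

static unsigned int count_names(const int *refs_per_item, int nitems)
{
	unsigned int nlink = 0;

	/* Each ref item can hold several names; add them all up. */
	for (int i = 0; i < nitems; i++)
		nlink += refs_per_item[i];
	return nlink;
}

int main(void)
{
	struct inode_model inode = { .nlink = 1 };	/* stale after replay */
	int ref_items[]    = { 2 };	/* two names packed in one ref item */
	int extref_items[] = { 1 };	/* one more name in an extref item */

	unsigned int nlink = count_names(ref_items, 1) +
			     count_names(extref_items, 1);
	if (nlink != inode.nlink) {
		printf("fixing nlink %u -> %u\n", inode.nlink, nlink);
		inode.nlink = nlink;
	}
	if (inode.nlink == 0)
		printf("no names left: inode becomes an orphan\n");
	return 0;
}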
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + u64 nlink = 0; + u64 ino = btrfs_ino(BTRFS_I(inode)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = count_inode_refs(root, BTRFS_I(inode), path); + if (ret < 0) + goto out; + + nlink = ret; + + ret = count_inode_extrefs(root, BTRFS_I(inode), path); + if (ret < 0) + goto out; + + nlink += ret; + + ret = 0; + + if (nlink != inode->i_nlink) { + set_nlink(inode, nlink); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + ino, 1); + if (ret) + goto out; + } + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; + } + +out: + btrfs_free_path(path); + return ret; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + if (ret) + break; + + btrfs_release_path(path); + inode = read_one_inode(root, key.offset); + if (!inode) { + ret = -EIO; + break; + } + + ret = fixup_inode_link_count(trans, root, inode); + iput(inode); + if (ret) + break; + + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; + } + btrfs_release_path(path); + return ret; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + if (!inode) + return -EIO; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(path); + if (ret == 0) { + if (!inode->i_nlink) + set_nlink(inode, 1); + else + inc_nlink(inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + } else if (ret == -EEXIST) { + ret = 0; + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. 
This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 index, + const struct fscrypt_str *name, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_path *path, + struct btrfs_dir_item *dst_di, + const struct btrfs_key *log_key, + u8 log_flags, + bool exists) +{ + struct btrfs_key found_key; + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* The existing dentry points to the same inode, don't delete it. */ + if (found_key.objectid == log_key->objectid && + found_key.type == log_key->type && + found_key.offset == log_key->offset && + btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) + return 1; + + /* + * Don't drop the conflicting directory entry if the inode for the new + * entry doesn't exist. + */ + if (!exists) + return 0; + + return drop_one_dir_item(trans, path, dir, dst_di); +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + * + * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a + * non-existing inode) and 1 if the name was replayed. 
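The conflicting-dentry decision made by delete_conflicting_dir_entry() above can be summarized in three outcomes. The sketch below is a simplified standalone model that only compares inode numbers; the real code also compares the dir entry key type/offset and flags.

/* Standalone sketch of the conflicting-dentry decision. */
#include <stdio.h>

enum action { KEEP_EXISTING, LEAVE_FOR_NOW, DROP_CONFLICT };

static enum action resolve(unsigned long long existing_ino,
			   unsigned long long logged_ino,
			   int logged_inode_exists)
{
	if (existing_ino == logged_ino)
		return KEEP_EXISTING;	/* same inode: nothing to do */
	if (!logged_inode_exists)
		return LEAVE_FOR_NOW;	/* don't drop for a not-yet-created inode */
	return DROP_CONFLICT;		/* unlink the old name, then add the new one */
}

int main(void)
{
	printf("%d\n", resolve(257, 257, 1));	/* KEEP_EXISTING */
	printf("%d\n", resolve(258, 257, 0));	/* LEAVE_FOR_NOW */
	printf("%d\n", resolve(258, 257, 1));	/* DROP_CONFLICT */
	return 0;
}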
+ */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + struct fscrypt_str name; + struct btrfs_dir_item *dir_dst_di; + struct btrfs_dir_item *index_dst_di; + bool dir_dst_matches = false; + bool index_dst_matches = false; + struct btrfs_key log_key; + struct btrfs_key search_key; + struct inode *dir; + u8 log_flags; + bool exists; + int ret; + bool update_size = true; + bool name_added = false; + + dir = read_one_inode(root, key->objectid); + if (!dir) + return -EIO; + + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) + goto out; + + log_flags = btrfs_dir_flags(eb, di); + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); + btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; + + dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + &name, 1); + if (IS_ERR(dir_dst_di)) { + ret = PTR_ERR(dir_dst_di); + goto out; + } else if (dir_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + dir_dst_di, &log_key, + log_flags, exists); + if (ret < 0) + goto out; + dir_dst_matches = (ret == 1); + } + + btrfs_release_path(path); + + index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, key->offset, + &name, 1); + if (IS_ERR(index_dst_di)) { + ret = PTR_ERR(index_dst_di); + goto out; + } else if (index_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + index_dst_di, &log_key, + log_flags, exists); + if (ret < 0) + goto out; + index_dst_matches = (ret == 1); + } + + btrfs_release_path(path); + + if (dir_dst_matches && index_dst_matches) { + ret = 0; + update_size = false; + goto out; + } + + /* + * Check if the inode reference exists in the log for the given name, + * inode and parent inode + */ + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, 0, &name); + if (ret < 0) { + goto out; + } else if (ret) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } + + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); + if (ret < 0) { + goto out; + } else if (ret) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } + btrfs_release_path(path); + ret = insert_one_name(trans, root, key->objectid, key->offset, + &name, &log_key); + if (ret && ret != -ENOENT && ret != -EEXIST) + goto out; + if (!ret) + name_added = true; + update_size = false; + ret = 0; + +out: + if (!ret && update_size) { + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + } + kfree(name.name); + iput(dir); + if (!ret && name_added) + ret = 1; + return ret; +} + +/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + struct btrfs_dir_item *di; + + /* We only log dir index keys, which only contain a single dir item. 
*/ + ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + ret = replay_one_name(trans, root, path, eb, di, key); + if (ret < 0) + return ret; + + /* + * If this entry refers to a non-directory (directories can not have a + * link count > 1) and it was added in the transaction that was not + * committed, make sure we fixup the link count of the inode the entry + * points to. Otherwise something like the following would result in a + * directory pointing to an inode with a wrong link that does not account + * for this dir entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two entries + * pointing to it in the directory testdir. This would make it impossible + * to ever delete the parent directory has it would result in stale + * dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { + struct btrfs_path *fixup_path; + struct btrfs_key di_key; + + fixup_path = btrfs_alloc_path(); + if (!fixup_path) + return -ENOMEM; + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); + btrfs_free_path(fixup_path); + } + + return ret; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. 
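The range items described above make the log authoritative only for parts of the dir index keyspace. A standalone sketch of the find_dir_range()-style lookup, with an invented range table: given a starting index, snap to the logged range that covers it, or to the next one after it.

/* Standalone sketch of the authoritative-range lookup. */
#include <stdio.h>

struct dir_range { unsigned long long start, end; };	/* inclusive */

/* Logged ranges for one directory, sorted by start. */
static const struct dir_range ranges[] = { { 0, 9 }, { 20, 34 } };
#define NRANGES (sizeof(ranges) / sizeof(ranges[0]))

static int find_dir_range(unsigned long long *start, unsigned long long *end)
{
	for (size_t i = 0; i < NRANGES; i++) {
		if (*start <= ranges[i].end) {
			/* Snap to the range: entries with an index inside
			 * [start, end] but missing from the log were
			 * deleted before the fsync. */
			*start = (*start > ranges[i].start) ? *start : ranges[i].start;
			*end = ranges[i].end;
			return 0;
		}
	}
	return 1;	/* no further range: the log says nothing past here */
}

int main(void)
{
	unsigned long long start = 0, end;

	while (find_dir_range(&start, &end) == 0) {
		printf("log is authoritative for index [%llu, %llu]\n", start, end);
		if (end == (unsigned long long)-1)
			break;
		start = end + 1;
	}
	return 0;
}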
+ */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +/* + * this looks for a given directory item in the log. If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret; + struct extent_buffer *eb; + int slot; + struct btrfs_dir_item *di; + struct fscrypt_str name; + struct inode *inode = NULL; + struct btrfs_key location; + + /* + * Currently we only log dir index keys. Even if we replay a log created + * by an older kernel that logged both dir index and dir item keys, all + * we need to do is process the dir index keys, we (and our caller) can + * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). + */ + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + + eb = path->nodes[0]; + slot = path->slots[0]; + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) + goto out; + + if (log) { + struct btrfs_dir_item *log_di; + + log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, &name, 0); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } else if (log_di) { + /* The dentry exists in the log, we have nothing to do. 
*/ + ret = 0; + goto out; + } + } + + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + inc_nlink(inode); + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), + &name); + /* + * Unlike dir item keys, dir index keys can only have one name (entry) in + * them, as there are no key collisions since each key has a unique offset + * (an index number), so we're done. + */ +out: + btrfs_release_path(path); + btrfs_release_path(log_path); + kfree(name.name); + iput(inode); + return ret; +} + +static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + const u64 ino) +{ + struct btrfs_key search_key; + struct btrfs_path *log_path; + int i; + int nritems; + int ret; + + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_XATTR_ITEM_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; +process_leaf: + nritems = btrfs_header_nritems(path->nodes[0]); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + u32 total_size; + u32 cur; + + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u32 this_len = sizeof(*di) + name_len + data_len; + char *name; + + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(path->nodes[0], name, + (unsigned long)(di + 1), name_len); + + log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, + name, name_len, 0); + btrfs_release_path(log_path); + if (!log_di) { + /* Doesn't exist in log tree, so delete it. */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, ino, + name, name_len, -1); + kfree(name); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + ASSERT(di); + ret = btrfs_delete_one_dir_name(trans, root, + path, di); + if (ret) + goto out; + btrfs_release_path(path); + search_key = key; + goto again; + } + kfree(name); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + ret = btrfs_next_leaf(root, path); + if (ret > 0) + ret = 0; + else if (ret == 0) + goto process_leaf; +out: + btrfs_free_path(log_path); + btrfs_release_path(path); + return ret; +} + + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. 
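Putting the range lookup together with the per-entry check, the deletion sweep described above amounts to: within a range the log is authoritative for, any directory entry present in the subvolume but absent from the log gets unlinked. A standalone sketch where plain arrays stand in for the two trees:

/* Standalone sketch of the deletion-replay sweep. */
#include <stdio.h>

static int in_log(const unsigned long long *log, int n, unsigned long long idx)
{
	for (int i = 0; i < n; i++)
		if (log[i] == idx)
			return 1;
	return 0;
}

int main(void)
{
	/* Entries currently in the subvolume directory (by dir index). */
	unsigned long long subvol[] = { 2, 3, 4, 40 };
	/* Entries present in the log, which is authoritative for [0, 9]. */
	unsigned long long logged[] = { 2, 4 };
	unsigned long long range_start = 0, range_end = 9;

	for (int i = 0; i < 4; i++) {
		unsigned long long idx = subvol[i];

		if (idx < range_start || idx > range_end)
			continue;	/* the log says nothing about this index */
		if (!in_log(logged, 2, idx))
			printf("index %llu was deleted before the fsync: unlink it\n",
			       idx);
	}
	return 0;
}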
+ */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all) +{ + u64 range_start; + u64 range_end; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_INDEX_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } + + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, + &range_start, &range_end); + if (ret < 0) + goto out; + else if (ret > 0) + break; + } + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + break; + else if (ret < 0) + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) { + ret = 0; + goto out; + } + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, log, path, + log_path, dir, + &found_key); + if (ret) + goto out; + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + ret = 0; +out: + btrfs_release_path(path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). + */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level) +{ + int nritems; + struct btrfs_tree_parent_check check = { + .transid = gen, + .level = level + }; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + int i; + int ret; + + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) + return ret; + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + /* + * If we have a tmpfile (O_TMPFILE) that got fsync'ed + * and never got linked before the fsync, skip it, as + * replaying it is pointless since it would be deleted + * later. 
We skip logging tmpfiles, but it's always + * possible we are replaying a log created with a kernel + * that used to log tmpfiles. + */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; + continue; + } else { + wc->ignore_cur_inode = false; + } + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) + break; + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid, 0); + if (ret) + break; + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + + /* + * Before replaying extents, truncate the inode to its + * size. We need to do it now and not after log replay + * because before an fsync we can have prealloc extents + * added beyond the inode's i_size. If we did it after, + * through orphan cleanup for example, we would drop + * those prealloc extents just after replaying them. + */ + if (S_ISREG(mode)) { + struct btrfs_drop_extents_args drop_args = { 0 }; + struct inode *inode; + u64 from; + + inode = read_one_inode(root, key.objectid); + if (!inode) { + ret = -EIO; + break; + } + from = ALIGN(i_size_read(inode), + root->fs_info->sectorsize); + drop_args.start = from; + drop_args.end = (u64)-1; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(wc->trans, root, + BTRFS_I(inode), + &drop_args); + if (!ret) { + inode_sub_bytes(inode, + drop_args.bytes_found); + /* Update the inode's nbytes. */ + ret = btrfs_update_inode(wc->trans, + root, BTRFS_I(inode)); + } + iput(inode); + if (ret) + break; + } + + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + if (ret) + break; + } + + if (wc->ignore_cur_inode) + continue; + + if (key.type == BTRFS_DIR_INDEX_KEY && + wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } + + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } else if (key.type == BTRFS_INODE_REF_KEY || + key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + if (ret && ret != -ENOENT) + break; + ret = 0; + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } + /* + * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the + * BTRFS_DIR_INDEX_KEY items which we use to derive the + * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an + * older kernel with such keys, ignore them. 
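/*
 * Editor's illustrative sketch (not part of this patch): the staged
 * dispatch of replay_one_buffer() reduced to a userspace toy.  Inode items
 * are replayed first so every inode exists before the items referencing
 * them are copied; the per-stage work is reduced to a print here, and the
 * enum names are hypothetical stand-ins for the kernel's walk stages.
 */
#include <stdio.h>

enum stage { REPLAY_INODES, REPLAY_DIR_INDEX, REPLAY_ALL };
enum key_type { INODE_ITEM, DIR_INDEX, INODE_REF, EXTENT_DATA, XATTR_ITEM };

static void replay_key(enum stage stage, enum key_type type)
{
	if (stage == REPLAY_INODES) {
		if (type == INODE_ITEM)
			printf("stage 1: replay inode item\n");
		return;
	}
	if (stage == REPLAY_DIR_INDEX) {
		if (type == DIR_INDEX)
			printf("stage 2: replay dir index entry\n");
		return;
	}
	/* Final stage: the remaining simple item types are just copied. */
	if (type == XATTR_ITEM || type == INODE_REF || type == EXTENT_DATA)
		printf("stage 3: copy item of type %d\n", type);
}

int main(void)
{
	enum key_type leaf[] = { INODE_ITEM, DIR_INDEX, INODE_REF, EXTENT_DATA };

	for (enum stage s = REPLAY_INODES; s <= REPLAY_ALL; s++)
		for (int i = 0; i < 4; i++)
			replay_key(s, leaf[i]);
	return 0;
}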
+ */ + } + btrfs_free_path(path); + return ret; +} + +/* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return; + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->reserved -= fs_info->nodesize; + cache->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); +} + +static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) +{ + int ret; + + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + + if (trans) { + ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + if (ret) + return ret; + btrfs_redirty_list_add(trans->transaction, eb); + } else { + unaccount_log_buffer(eb->fs_info, eb->start); + } + + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + int ret = 0; + + while (*level > 0) { + struct btrfs_tree_parent_check check = { 0 }; + + cur = path->nodes[*level]; + + WARN_ON(btrfs_header_level(cur) != *level); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + check.transid = ptr_gen; + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); + + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); + if (IS_ERR(next)) + return PTR_ERR(next); + + if (*level == 1) { + ret = wc->process_func(root, next, wc, ptr_gen, + *level - 1); + if (ret) { + free_extent_buffer(next); + return ret; + } + + path->slots[*level]++; + if (wc->free) { + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); + return ret; + } + + ret = clean_log_buffer(trans, next); + if (ret) { + free_extent_buffer(next); + return ret; + } + } + free_extent_buffer(next); + continue; + } + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); + return ret; + } + + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + ret = wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level]), + *level); + if (ret) + return ret; + + if 
(wc->free) { + ret = clean_log_buffer(trans, path->nodes[*level]); + if (ret) + return ret; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int orig_level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + atomic_inc(&log->node->refs); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) { + ret = wret; + goto out; + } + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) { + ret = wret; + goto out; + } + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + ret = wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level]), + orig_level); + if (ret) + goto out; + if (wc->free) + ret = clean_log_buffer(trans, path->nodes[orig_level]); + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_root_item *root_item) +{ + struct btrfs_fs_info *fs_info = log->fs_info; + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, fs_info->log_root_tree, + &log->root_key, root_item); + } else { + ret = btrfs_update_root(trans, fs_info->log_root_tree, + &log->root_key, root_item); + } + return ret; +} + +static void wait_log_commit(struct btrfs_root *root, int transid) +{ + DEFINE_WAIT(wait); + int index = transid % 2; + + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ + for (;;) { + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + + if (!(root->log_transid_committed < transid && + atomic_read(&root->log_commit[index]))) + break; + + mutex_unlock(&root->log_mutex); + schedule(); + mutex_lock(&root->log_mutex); + } + finish_wait(&root->log_commit_wait[index], &wait); +} + +static void wait_for_writer(struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&root->log_writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!atomic_read(&root->log_writers)) + break; + + mutex_unlock(&root->log_mutex); + schedule(); + mutex_lock(&root->log_mutex); + } + finish_wait(&root->log_writer_wait, &wait); +} + +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. 
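/*
 * Editor's illustrative sketch (not part of this patch): the double-buffered
 * log commit state that wait_log_commit() polls, modeled without locking or
 * wait queues.  At most two log transactions are in flight, so a transid
 * maps to slot transid % 2, and a waiter is done once either its transaction
 * committed or no commit is running in its slot.  Types are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct log_state {
	long long transid_committed;	/* last fully committed log transid */
	int commit_running[2];		/* one flag per in-flight slot */
};

static bool must_keep_waiting(const struct log_state *s, long long transid)
{
	int index = transid % 2;

	return s->transid_committed < transid && s->commit_running[index];
}

int main(void)
{
	struct log_state s = { .transid_committed = 4, .commit_running = { 1, 0 } };

	/* transid 6 maps to slot 0, which is still committing: keep waiting. */
	printf("wait for 6: %d\n", must_keep_waiting(&s, 6));
	/* transid 5 maps to slot 1, nothing running there: done. */
	printf("wait for 5: %d\n", must_keep_waiting(&s, 5));
	return 0;
}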
+ */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + struct btrfs_log_ctx *safe; + + list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { + list_del_init(&ctx->list); + ctx->log_ret = error; + } +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_log_ctx *ctx) +{ + int index1; + int index2; + int mark; + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; + struct btrfs_root_item new_root_item; + int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; + struct blk_plug plug; + u64 log_root_start; + u64 log_root_level; + + mutex_lock(&root->log_mutex); + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(root, log_transid); + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + ASSERT(log_transid == root->log_transid); + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(root, log_transid - 1); + + while (1) { + int batch = atomic_read(&root->log_batch); + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(fs_info, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } + wait_for_writer(root); + if (batch == atomic_read(&root->log_batch)) + break; + } + + /* bail out if we need to do a full commit */ + if (btrfs_need_log_full_commit(trans)) { + ret = BTRFS_LOG_FORCE_COMMIT; + mutex_unlock(&root->log_mutex); + goto out; + } + + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + blk_start_plug(&plug); + ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); + /* + * -EAGAIN happens when someone, e.g., a concurrent transaction + * commit, writes a dirty extent in this tree-log commit. This + * concurrent write will create a hole writing out the extents, + * and we cannot proceed on a zoned filesystem, requiring + * sequential writing. While we can bail out to a full commit + * here, but we can continue hoping the concurrent writing fills + * the hole. 
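/*
 * Editor's illustrative sketch (not part of this patch): the batch
 * collection loop at the top of btrfs_sync_log(), as a single-threaded toy.
 * The committer re-reads the batch counter after waiting for writers and
 * only proceeds once no new writer joined in between, so one log commit can
 * absorb several concurrent fsyncs.  The timeline array below is a made-up
 * stand-in for other tasks bumping root->log_batch.
 */
#include <stdio.h>

int main(void)
{
	/* Values another task would have stored in the batch counter over time. */
	int log_batch_timeline[] = { 3, 5, 5, 5 };
	int step = 0;
	int rounds = 0;

	while (1) {
		int batch = log_batch_timeline[step];

		/* wait_for_writer() would sleep here; writers may bump the count. */
		if (step < 3)
			step++;
		rounds++;
		if (batch == log_batch_timeline[step])
			break;	/* nothing new arrived, safe to commit this batch */
	}
	printf("committing after %d rounds, batch=%d\n",
	       rounds, log_batch_timeline[step]);
	return 0;
}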
+ */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) + ret = 0; + if (ret) { + blk_finish_plug(&plug); + btrfs_set_log_full_commit(trans); + mutex_unlock(&root->log_mutex); + goto out; + } + + /* + * We _must_ update under the root->log_mutex in order to make sure we + * have a consistent view of the log root we are trying to commit at + * this moment. + * + * We _must_ copy this into a local copy, because we are not holding the + * log_root_tree->log_mutex yet. This is important because when we + * commit the log_root_tree we must have a consistent view of the + * log_root_tree when we update the super block to point at the + * log_root_tree bytenr. If we update the log_root_tree here we'll race + * with the commit and possibly point at the new block which we may not + * have written out. + */ + btrfs_set_root_node(&log->root_item, log->node); + memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); + + root->log_transid++; + log->log_transid = root->log_transid; + root->log_start_pid = 0; + /* + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + + if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); + if (!log_root_tree->node) { + ret = btrfs_alloc_log_tree_node(trans, log_root_tree); + if (ret) { + mutex_unlock(&fs_info->tree_root->log_mutex); + blk_finish_plug(&plug); + goto out; + } + } + mutex_unlock(&fs_info->tree_root->log_mutex); + } + + btrfs_init_log_ctx(&root_log_ctx, NULL); + + mutex_lock(&log_root_tree->log_mutex); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + + /* + * Now we are safe to update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. 
+ */ + ret = update_log_root(trans, log, &new_root_item); + if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + + blk_finish_plug(&plug); + btrfs_set_log_full_commit(trans); + if (ret != -ENOSPC) + btrfs_err(fs_info, + "failed to update log for root %llu ret %d", + root->root_key.objectid, ret); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + + index2 = root_log_ctx.log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + blk_finish_plug(&plug); + ret = btrfs_wait_tree_log_extents(log, mark); + wait_log_commit(log_root_tree, + root_log_ctx.log_transid); + mutex_unlock(&log_root_tree->log_mutex); + if (!ret) + ret = root_log_ctx.log_ret; + goto out; + } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(log_root_tree, + root_log_ctx.log_transid - 1); + } + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (btrfs_need_log_full_commit(trans)) { + blk_finish_plug(&plug); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = BTRFS_LOG_FORCE_COMMIT; + goto out_wake_log_root; + } + + ret = btrfs_write_marked_extents(fs_info, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + blk_finish_plug(&plug); + /* + * As described above, -EAGAIN indicates a hole in the extents. We + * cannot wait for these write outs since the waiting cause a + * deadlock. Bail out to the full commit instead. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { + btrfs_set_log_full_commit(trans); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } else if (ret) { + btrfs_set_log_full_commit(trans); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + ret = btrfs_wait_tree_log_extents(log, mark); + if (!ret) + ret = btrfs_wait_tree_log_extents(log_root_tree, + EXTENT_NEW | EXTENT_DIRTY); + if (ret) { + btrfs_set_log_full_commit(trans); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + + log_root_start = log_root_tree->node->start; + log_root_level = btrfs_header_level(log_root_tree->node); + log_root_tree->log_transid++; + mutex_unlock(&log_root_tree->log_mutex); + + /* + * Here we are guaranteed that nobody is going to write the superblock + * for the current transaction before us and that neither we do write + * our superblock before the previous transaction finishes its commit + * and writes its superblock, because: + * + * 1) We are holding a handle on the current transaction, so no body + * can commit it until we release the handle; + * + * 2) Before writing our superblock we acquire the tree_log_mutex, so + * if the previous transaction is still committing, and hasn't yet + * written its superblock, we wait for it to do it, because a + * transaction commit acquires the tree_log_mutex when the commit + * begins and releases it only after writing its superblock. 
+ */ + mutex_lock(&fs_info->tree_log_mutex); + + /* + * The previous transaction writeout phase could have failed, and thus + * marked the fs in an error state. We must not commit here, as we + * could have updated our generation in the super_for_commit and + * writing the super here would result in transid mismatches. If there + * is an error here just bail. + */ + if (BTRFS_FS_ERROR(fs_info)) { + ret = -EIO; + btrfs_set_log_full_commit(trans); + btrfs_abort_transaction(trans, ret); + mutex_unlock(&fs_info->tree_log_mutex); + goto out_wake_log_root; + } + + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); + btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); + ret = write_all_supers(fs_info, 1); + mutex_unlock(&fs_info->tree_log_mutex); + if (ret) { + btrfs_set_log_full_commit(trans); + btrfs_abort_transaction(trans, ret); + goto out_wake_log_root; + } + + /* + * We know there can only be one task here, since we have not yet set + * root->log_commit[index1] to 0 and any task attempting to sync the + * log must wait for the previous log transaction to commit if it's + * still in progress or wait for the current log transaction commit if + * someone else already started it. We use <= and not < because the + * first log transaction has an ID of 0. + */ + ASSERT(root->last_log_commit <= log_transid); + root->last_log_commit = log_transid; + +out_wake_log_root: + mutex_lock(&log_root_tree->log_mutex); + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + log_root_tree->log_transid_committed++; + atomic_set(&log_root_tree->log_commit[index2], 0); + mutex_unlock(&log_root_tree->log_mutex); + + /* + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. + */ + cond_wake_up(&log_root_tree->log_commit_wait[index2]); +out: + mutex_lock(&root->log_mutex); + btrfs_remove_all_log_ctxs(root, index1, ret); + root->log_transid_committed++; + atomic_set(&root->log_commit[index1], 0); + mutex_unlock(&root->log_mutex); + + /* + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. + */ + cond_wake_up(&root->log_commit_wait[index1]); + return ret; +} + +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + if (log->node) { + ret = walk_log_tree(trans, log, &wc); + if (ret) { + /* + * We weren't able to traverse the entire log tree, the + * typical scenario is getting an -EIO when reading an + * extent buffer of the tree, due to a previous writeback + * failure of it. + */ + set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + &log->fs_info->fs_state); + + /* + * Some extent buffers of the log tree may still be dirty + * and not yet written back to storage, because we may + * have updates to a log tree without syncing a log tree, + * such as during rename and link operations. So flush + * them out and wait for their writeback to complete, so + * that we properly cleanup their state and pages. 
+ */ + btrfs_write_marked_extents(log->fs_info, + &log->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + btrfs_wait_tree_log_extents(log, + EXTENT_DIRTY | EXTENT_NEW); + + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } + } + + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->log_csum_range); + + btrfs_put_root(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); + } + return 0; +} + +/* + * Check if an inode was logged in the current transaction. This correctly deals + * with the case where the inode was logged but has a logged_trans of 0, which + * happens if the inode is evicted and loaded again, as logged_trans is an in + * memory only field (not persisted). + * + * Returns 1 if the inode was logged before in the transaction, 0 if it was not, + * and < 0 on error. + */ +static int inode_logged(const struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path_in) +{ + struct btrfs_path *path = path_in; + struct btrfs_key key; + int ret; + + if (inode->logged_trans == trans->transid) + return 1; + + /* + * If logged_trans is not 0, then we know the inode logged was not logged + * in this transaction, so we can return false right away. + */ + if (inode->logged_trans > 0) + return 0; + + /* + * If no log tree was created for this root in this transaction, then + * the inode can not have been logged in this transaction. In that case + * set logged_trans to anything greater than 0 and less than the current + * transaction's ID, to avoid the search below in a future call in case + * a log tree gets created after this. + */ + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * We have a log tree and the inode's logged_trans is 0. We can't tell + * for sure if the inode was logged before in this transaction by looking + * only at logged_trans. We could be pessimistic and assume it was, but + * that can lead to unnecessarily logging an inode during rename and link + * operations, and then further updating the log in followup rename and + * link operations, specially if it's a directory, which adds latency + * visible to applications doing a series of rename or link operations. + * + * A logged_trans of 0 here can mean several things: + * + * 1) The inode was never logged since the filesystem was mounted, and may + * or may have not been evicted and loaded again; + * + * 2) The inode was logged in a previous transaction, then evicted and + * then loaded again; + * + * 3) The inode was logged in the current transaction, then evicted and + * then loaded again. + * + * For cases 1) and 2) we don't want to return true, but we need to detect + * case 3) and return true. So we do a search in the log root for the inode + * item. 
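/*
 * Editor's illustrative sketch (not part of this patch): the decision tree
 * of inode_logged() as a userspace toy.  Because logged_trans is in-memory
 * only, a value of 0 after eviction is ambiguous and may require a lookup
 * in the log tree, modeled here by a boolean.  The struct and helper names
 * are hypothetical simplifications.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_inode { unsigned long long logged_trans; };

static int inode_logged_toy(struct toy_inode *inode, unsigned long long transid,
			    bool root_has_log_tree, bool item_in_log_tree)
{
	if (inode->logged_trans == transid)
		return 1;
	if (inode->logged_trans > 0)
		return 0;	/* logged in an earlier transaction only */
	if (!root_has_log_tree || !item_in_log_tree) {
		/* Cache a "not logged" answer to skip future searches. */
		inode->logged_trans = transid - 1;
		return 0;
	}
	/* Logged in this transaction, then evicted and loaded again. */
	inode->logged_trans = transid;
	return 1;
}

int main(void)
{
	struct toy_inode evicted = { .logged_trans = 0 };

	printf("evicted but present in log tree: %d\n",
	       inode_logged_toy(&evicted, 100, true, true));
	printf("cached logged_trans afterwards: %llu\n", evicted.logged_trans);
	return 0;
}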
+ */ + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + + if (path_in) + btrfs_release_path(path); + else + btrfs_free_path(path); + + /* + * Logging an inode always results in logging its inode item. So if we + * did not find the item we know the inode was not logged for sure. + */ + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* + * Set logged_trans to a value greater than 0 and less then the + * current transaction to avoid doing the search in future calls. + */ + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * The inode was previously logged and then evicted, set logged_trans to + * the current transacion's ID, to avoid future tree searches as long as + * the inode is not evicted again. + */ + inode->logged_trans = trans->transid; + + /* + * If it's a directory, then we must set last_dir_index_offset to the + * maximum possible value, so that the next attempt to log the inode does + * not skip checking if dir index keys found in modified subvolume tree + * leaves have been logged before, otherwise it would result in attempts + * to insert duplicate dir index keys in the log tree. This must be done + * because last_dir_index_offset is an in-memory only field, not persisted + * in the inode item or any other on-disk structure, so its value is lost + * once the inode is evicted. + */ + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->last_dir_index_offset = (u64)-1; + + return 1; +} + +/* + * Delete a directory entry from the log if it exists. + * + * Returns < 0 on error + * 1 if the entry does not exists + * 0 if the entry existed and was successfully deleted + */ +static int del_logged_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dir_ino, + const struct fscrypt_str *name, + u64 index) +{ + struct btrfs_dir_item *di; + + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. + */ + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, -1); + if (IS_ERR(di)) + return PTR_ERR(di); + else if (!di) + return 1; + + /* + * We do not need to update the size field of the directory's + * inode item because on log replay we update the field to reflect + * all existing entries in the directory (see overwrite_item()). + */ + return btrfs_delete_one_dir_name(trans, log, path, di); +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. 
+ */ +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct fscrypt_str *name, + struct btrfs_inode *dir, u64 index) +{ + struct btrfs_path *path; + int ret; + + ret = inode_logged(trans, dir, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); + return; + } + + ret = join_running_log_trans(root); + if (ret) + return; + + mutex_lock(&dir->log_mutex); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out_unlock; + } + + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), + name, index); + btrfs_free_path(path); +out_unlock: + mutex_unlock(&dir->log_mutex); + if (ret < 0) + btrfs_set_log_full_commit(trans); + btrfs_end_log_trans(root); +} + +/* see comments for btrfs_del_dir_entries_in_log */ +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct fscrypt_str *name, + struct btrfs_inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + ret = inode_logged(trans, inode, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); + return; + } + + ret = join_running_log_trans(root); + if (ret) + return; + log = root->log_root; + mutex_lock(&inode->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), + dirid, &index); + mutex_unlock(&inode->log_mutex); + if (ret < 0 && ret != -ENOENT) + btrfs_set_log_full_commit(trans); + btrfs_end_log_trans(root); +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. 
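/*
 * Editor's illustrative sketch (not part of this patch): the range merge
 * that insert_dir_log_key() performs when the key already exists.  A dir
 * log item records [first_offset, last_offset] as the index range the log
 * is authoritative for, and on a duplicate insert the end is only ever
 * widened, never shrunk.  Types and values below are made up.
 */
#include <stdio.h>

struct dir_log_item { unsigned long long first, end; };

static void update_dir_log_end(struct dir_log_item *item,
			       unsigned long long new_end, int already_exists)
{
	if (already_exists && item->end > new_end)
		new_end = item->end;	/* never shrink the covered range */
	item->end = new_end;
}

int main(void)
{
	struct dir_log_item item = { .first = 10, .end = 50 };

	/* A later, narrower insert for the same key must not shrink the range. */
	update_dir_log_end(&item, 30, 1);
	printf("authoritative range is now [%llu, %llu]\n", item.first, item.end);
	return 0;
}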
+ */ + last_offset = max(last_offset, curr_end); + } + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct extent_buffer *src, + struct btrfs_path *dst_path, + int start_slot, + int count) +{ + struct btrfs_root *log = inode->root->log_root; + char *ins_data = NULL; + struct btrfs_item_batch batch; + struct extent_buffer *dst; + unsigned long src_offset; + unsigned long dst_offset; + u64 last_index; + struct btrfs_key key; + u32 item_size; + int ret; + int i; + + ASSERT(count > 0); + batch.nr = count; + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); + item_size = btrfs_item_size(src, start_slot); + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; + } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; + + ins_data = kmalloc(count * sizeof(u32) + + count * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + + for (i = 0; i < count; i++) { + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); + ins_sizes[i] = btrfs_item_size(src, slot); + batch.total_data_size += ins_sizes[i]; + } + } + + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); + if (ret) + goto out; + + dst = dst_path->nodes[0]; + /* + * Copy all the items in bulk, in a single copy operation. Item data is + * organized such that it's placed at the end of a leaf and from right + * to left. For example, the data for the second item ends at an offset + * that matches the offset where the data for the first item starts, the + * data for the third item ends at an offset that matches the offset + * where the data of the second items starts, and so on. + * Therefore our source and destination start offsets for copy match the + * offsets of the last items (highest slots). + */ + dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); + src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); + copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); + btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and force a transaction commit. 
+ */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; + + if (btrfs_get_first_dir_index_to_log(inode) == 0) + btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); +out: + kfree(ins_data); + + return ret; +} + +static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx, + u64 *last_old_dentry_offset) +{ + struct btrfs_root *log = inode->root->log_root; + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); + const u64 ino = btrfs_ino(inode); + bool last_found = false; + int batch_start = 0; + int batch_size = 0; + int i; + + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key key; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + + if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { + last_found = true; + break; + } + + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + + /* + * Skip ranges of items that consist only of dir item keys created + * in past transactions. However if we find a gap, we must log a + * dir index range item for that gap, so that index keys in that + * gap are deleted during log replay. + */ + if (btrfs_dir_transid(src, di) < trans->transid) { + if (key.offset > *last_old_dentry_offset + 1) { + ret = insert_dir_log_key(trans, log, dst_path, + ino, *last_old_dentry_offset + 1, + key.offset - 1); + if (ret < 0) + return ret; + } + + *last_old_dentry_offset = key.offset; + continue; + } + + /* If we logged this dir index item before, we can skip it. */ + if (key.offset <= inode->last_dir_index_offset) + continue; + + /* + * We must make sure that when we log a directory entry, the + * corresponding inode, after log replay, has a matching link + * count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * + * + * + * Would result in a fsync log that when replayed, our file inode + * would have a link count of 1, but we get two directory entries + * pointing to the same inode. After removing one of the names, + * it would not be possible to remove the other name, which + * resulted always in stale file handle errors, and would not be + * possible to rmdir the parent directory, since its i_size could + * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, + * resulting in -ENOTEMPTY errors. + */ + if (!ctx->log_new_dentries) { + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(src, di, &di_key); + if (di_key.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + + if (batch_size == 0) + batch_start = i; + batch_size++; + } + + if (batch_size > 0) { + int ret; + + ret = flush_dir_items_batch(trans, inode, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + } + + return last_found ? 1 : 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. 
This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_root *root = inode->root; + struct btrfs_root *log = root->log_root; + int ret; + u64 last_old_dentry_offset = min_offset - 1; + u64 last_offset = (u64)-1; + u64 ino = btrfs_ino(inode); + + min_key.objectid = ino; + min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = min_offset; + + ret = btrfs_search_forward(root, &min_key, path, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { + min_key.objectid = ino; + min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = (u64)-1; + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; + } else if (ret > 0) { + ret = 0; + } + + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); + if (ret == 0) { + struct btrfs_key tmp; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + /* + * The dir index key before the first one we found that needs to + * be logged might be in a previous leaf, and there might be a + * gap between these keys, meaning that we had deletions that + * happened. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the + * previous key's offset plus 1, so that those deletes are replayed. + */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + goto done; + } + + btrfs_release_path(path); + + /* + * Find the first key from this transaction again or the one we were at + * in the loop below in case we had to reschedule. We may be logging the + * directory without holding its VFS lock, which happen when logging new + * dentries (through log_new_dir_dentries()) or in some cases when we + * need to log the parent directory of an inode. This means a dir index + * key might be deleted from the inode's root, and therefore we may not + * find it anymore. If we can't find it, just move to the next key. We + * can not bail out and ignore, because if we do that we will simply + * not log dir index keys that come after the one that was just deleted + * and we can end up logging a dir index range that ends at (u64)-1 + * (@last_offset is initialized to that), resulting in removing dir + * entries we should not remove at log replay time. + */ +search: + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret > 0) { + ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. 
*/ + ret = 0; + goto done; + } + } + if (ret < 0) + goto done; + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); + if (ret != 0) { + if (ret > 0) + ret = 0; + goto done; + } + path->slots[0] = btrfs_header_nritems(path->nodes[0]); + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret == 1) { + last_offset = (u64)-1; + ret = 0; + } + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); + if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + /* + * The next leaf was not changed in the current transaction + * and has at least one dir index key. + * We check for the next key because there might have been + * one or more deletions between the last key we logged and + * that next key. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's + * offset minus 1, so that those deletes are replayed. + */ + last_offset = min_key.offset - 1; + goto done; + } + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + } +done: + btrfs_release_path(path); + btrfs_release_path(dst_path); + + if (ret == 0) { + *last_offset_ret = last_offset; + /* + * In case the leaf was changed in the current transaction but + * all its dir items are from a past transaction, the last item + * in the leaf is a dir item and there's no gap between that last + * dir item and the first one on the next leaf (which did not + * change in the current transaction), then we don't need to log + * a range, last_old_dentry_offset is == to last_offset. + */ + ASSERT(last_old_dentry_offset <= last_offset); + if (last_old_dentry_offset < last_offset) + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); + } + + return ret; +} + +/* + * If the inode was logged before and it was evicted, then its + * last_dir_index_offset is (u64)-1, so we don't the value of the last index + * key offset. If that's the case, search for it and update the inode. This + * is to avoid lookups in the log tree every time we try to insert a dir index + * key from a leaf changed in the current transaction, and to allow us to always + * do batch insertions of dir index keys. + */ +static int update_last_dir_index_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + const struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_key key; + int ret; + + lockdep_assert_held(&inode->log_mutex); + + if (inode->last_dir_index_offset != (u64)-1) + return 0; + + if (!ctx->logged_before) { + inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; + return 0; + } + + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + /* + * An error happened or we actually have an index key with an offset + * value of (u64)-1. Bail out, we're done. + */ + if (ret <= 0) + goto out; + + ret = 0; + inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; + + /* + * No dir index items, bail out and leave last_dir_index_offset with + * the value right before the first valid index value. 
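/*
 * Editor's illustrative sketch (not part of this patch): how
 * update_last_dir_index_offset() recovers the last logged index after the
 * in-memory value was lost to eviction.  Searching the log for offset
 * (u64)-1 lands one slot past the last key, so the previous slot (if it is
 * still a dir index key of this directory) holds the answer.  The array and
 * constants below are simplified stand-ins.
 */
#include <stdio.h>

#define DIR_START_INDEX 2ULL	/* indexes 0 and 1 are reserved */

int main(void)
{
	/* Offsets of this directory's dir index keys found in the log. */
	unsigned long long index_offsets[] = { 2, 3, 4, 7 };
	int nr = 4;
	/* A search for offset (u64)-1 returns the slot just past the last key. */
	int slot = nr;
	unsigned long long last_dir_index_offset;

	if (slot == 0)
		last_dir_index_offset = DIR_START_INDEX - 1;	/* empty directory */
	else
		last_dir_index_offset = index_offsets[slot - 1];

	printf("resume logging after index %llu\n", last_dir_index_offset);
	return 0;
}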
+ */ + if (path->slots[0] == 0) + goto out; + + /* + * btrfs_search_slot() left us at one slot beyond the slot with the last + * index key, or beyond the last key of the directory that is not an + * index key. If we have an index key before, set last_dir_index_offset + * to its offset value, otherwise leave it with a value right before the + * first valid index value, as it means we have an empty directory. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) + inode->last_dir_index_offset = key.offset; + +out: + btrfs_release_path(path); + + return ret; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) +{ + u64 min_key; + u64 max_key; + int ret; + + ret = update_last_dir_index_offset(inode, path, ctx); + if (ret) + return ret; + + min_key = BTRFS_DIR_START_INDEX; + max_key = 0; + + while (1) { + ret = log_dir_items(trans, inode, path, dst_path, + ctx, min_key, &max_key); + if (ret) + return ret; + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_inode *inode, + int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + int start_slot; + + key.objectid = btrfs_ino(inode); + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + if (ret < 0) { + break; + } else if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != key.objectid) + break; + + found_key.offset = 0; + found_key.type = 0; + ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); + if (ret < 0) + break; + + ret = btrfs_del_items(trans, log, path, start_slot, + path->slots[0] - start_slot + 1); + /* + * If start slot isn't 0 then we don't need to re-search, we've + * found the last guy with the objectid in this tree. 
+ */ + if (ret || start_slot != 0) + break; + btrfs_release_path(path); + } + btrfs_release_path(path); + if (ret > 0) + ret = 0; + return ret; +} + +static int truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_inode *inode, + u64 new_size, u32 min_type) +{ + struct btrfs_truncate_control control = { + .new_size = new_size, + .ino = btrfs_ino(inode), + .min_type = min_type, + .skip_ref_updates = true, + }; + + return btrfs_truncate_inode_items(trans, log_root, &control); +} + +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode, int log_inode_only, + u64 logged_isize) +{ + struct btrfs_map_token token; + u64 flags; + + btrfs_init_map_token(&token, leaf); + + if (log_inode_only) { + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_token_inode_generation(&token, item, 0); + btrfs_set_token_inode_size(&token, item, logged_isize); + } else { + btrfs_set_token_inode_generation(&token, item, + BTRFS_I(inode)->generation); + btrfs_set_token_inode_size(&token, item, inode->i_size); + } + + btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); + btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); + btrfs_set_token_inode_mode(&token, item, inode->i_mode); + btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + + btrfs_set_token_timespec_sec(&token, &item->atime, + inode->i_atime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->atime, + inode->i_atime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->mtime, + inode->i_mtime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->ctime, + inode_get_ctime(inode).tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode_get_ctime(inode).tv_nsec); + + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait + * for ordered extent completion, which is where we update nbytes, it + * only waits for writeback to complete. During log replay as we find + * file extent items and replay them, we adjust the nbytes field of the + * inode item in subvolume tree as needed (see overwrite_item()). + */ + + btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); + btrfs_set_token_inode_transid(&token, item, trans->transid); + btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); + btrfs_set_token_inode_block_group(&token, item, 0); +} + +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct btrfs_inode *inode, bool inode_item_dropped) +{ + struct btrfs_inode_item *inode_item; + int ret; + + /* + * If we are doing a fast fsync and the inode was logged before in the + * current transaction, then we know the inode was previously logged and + * it exists in the log tree. 
For performance reasons, in this case use + * btrfs_search_slot() directly with ins_len set to 0 so that we never + * attempt a write lock on the leaf's parent, which adds unnecessary lock + * contention in case there are concurrent fsyncs for other inodes of the + * same subvolume. Using btrfs_insert_empty_item() when the inode item + * already exists can also result in unnecessarily splitting a leaf. + */ + if (!inode_item_dropped && inode->logged_trans == trans->transid) { + ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ASSERT(ret <= 0); + if (ret > 0) + ret = -ENOENT; + } else { + /* + * This means it is the first fsync in the current transaction, + * so the inode item is not in the log and we need to insert it. + * We can never get -EEXIST because we are only called for a fast + * fsync and in case an inode eviction happens after the inode was + * logged before in the current transaction, when we load again + * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime + * flags and set ->logged_trans to 0. + */ + ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + sizeof(*inode_item)); + ASSERT(ret != -EEXIST); + } + if (ret) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, + 0, 0); + btrfs_release_path(path); + return 0; +} + +static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_root *log_root, + struct btrfs_ordered_sum *sums) +{ + const u64 lock_end = sums->logical + sums->len - 1; + struct extent_state *cached_state = NULL; + int ret; + + /* + * If this inode was not used for reflink operations in the current + * transaction with new extents, then do the fast path, no need to + * worry about logging checksum items with overlapping ranges. + */ + if (inode->last_reflink_trans < trans->transid) + return btrfs_csum_file_blocks(trans, log_root, sums); + + /* + * Serialize logging for checksums. This is to avoid racing with the + * same checksum being logged by another task that is logging another + * file which happens to refer to the same extent as well. Such races + * can leave checksum items in the log with overlapping ranges. + */ + ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); + if (ret) + return ret; + /* + * Due to extent cloning, we might have logged a csum item that covers a + * subrange of a cloned extent, and later we can end up logging a csum + * item for a larger subrange of the same extent or the entire range. + * This would leave csum items in the log tree that cover the same range + * and break the searches for checksums in the log tree, resulting in + * some checksums missing in the fs/subvolume tree. So just delete (or + * trim and adjust) any existing csum items in the log for this range. 
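/*
 * Editor's illustrative sketch (not part of this patch): the
 * delete-then-insert idea behind log_csums().  Before inserting checksums
 * for a byte range, any previously logged range that overlaps it is trimmed
 * or dropped, so the log never ends up with overlapping csum items.  The
 * spanning case that would need a split into two pieces is omitted for
 * brevity; names and values are made up.
 */
#include <stdio.h>

struct csum_range { unsigned long long start, len; };

/* Trim an existing logged range so it no longer overlaps [start, start+len). */
static void trim_against(struct csum_range *old, unsigned long long start,
			 unsigned long long len)
{
	unsigned long long old_end = old->start + old->len;
	unsigned long long new_end = start + len;

	if (old_end <= start || old->start >= new_end)
		return;				/* no overlap */
	if (old->start < start) {
		old->len = start - old->start;	/* keep the left piece */
	} else if (old_end > new_end) {
		old->len = old_end - new_end;	/* keep the right piece */
		old->start = new_end;
	} else {
		old->len = 0;			/* fully covered: drop it */
	}
}

int main(void)
{
	struct csum_range logged = { .start = 0, .len = 64 * 1024 };
	struct csum_range incoming = { .start = 16 * 1024, .len = 128 * 1024 };

	trim_against(&logged, incoming.start, incoming.len);
	printf("old csum item now covers [%llu, %llu)\n",
	       logged.start, logged.start + logged.len);
	/* ...then the incoming range would be inserted as its own csum item. */
	return 0;
}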
+ */ + ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log_root, sums); + + unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); + + return ret; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *dst_path, + struct btrfs_path *src_path, + int start_slot, int nr, int inode_only, + u64 logged_isize) +{ + struct btrfs_root *log = inode->root->log_root; + struct btrfs_file_extent_item *extent; + struct extent_buffer *src; + int ret = 0; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + struct btrfs_item_batch batch; + char *ins_data; + int i; + int dst_index; + const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); + const u64 i_size = i_size_read(&inode->vfs_inode); + + /* + * To keep lockdep happy and avoid deadlocks, clone the source leaf and + * use the clone. This is because otherwise we would be changing the log + * tree, to insert items from the subvolume tree or insert csum items, + * while holding a read lock on a leaf from the subvolume tree, which + * creates a nasty lock dependency when COWing log tree nodes/leaves: + * + * 1) Modifying the log tree triggers an extent buffer allocation while + * holding a write lock on a parent extent buffer from the log tree. + * Allocating the pages for an extent buffer, or the extent buffer + * struct, can trigger inode eviction and finally the inode eviction + * will trigger a release/remove of a delayed node, which requires + * taking the delayed node's mutex; + * + * 2) Allocating a metadata extent for a log tree can trigger the async + * reclaim thread and make us wait for it to release enough space and + * unblock our reservation ticket. The reclaim thread can start + * flushing delayed items, and that in turn results in the need to + * lock delayed node mutexes and in the need to write lock extent + * buffers of a subvolume tree - all this while holding a write lock + * on the parent extent buffer in the log tree. + * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. 
+ */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + batch.nr = 0; + + dst_index = 0; + for (i = 0; i < nr; i++) { + const int src_slot = start_slot + i; + struct btrfs_root *csum_root; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_sum *sums_next; + LIST_HEAD(ordered_sums); + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + u64 extent_num_bytes; + bool is_old_extent; + + btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); + + if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) + goto add_to_batch; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + is_old_extent = (btrfs_file_extent_generation(src, extent) < + trans->transid); + + /* + * Don't copy extents from past generations. That would make us + * log a lot more metadata for common cases like doing only a + * few random writes into a file and then fsync it for the first + * time or after the full sync flag is set on the inode. We can + * get leaves full of extent items, most of which are from past + * generations, so we can skip them - as long as the inode has + * not been the target of a reflink operation in this transaction, + * as in that case it might have had file extent items with old + * generations copied into it. We also must always log prealloc + * extents that start at or beyond eof, otherwise we would lose + * them on log replay. + */ + if (is_old_extent && + ins_keys[dst_index].offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + + if (skip_csum) + goto add_to_batch; + + /* Only regular extents have checksums. */ + if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) + goto add_to_batch; + + /* + * If it's an extent created in a past transaction, then its + * checksums are already accessible from the committed csum tree, + * no need to log them. + */ + if (is_old_extent) + goto add_to_batch; + + disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); + /* If it's an explicit hole, there are no checksums. */ + if (disk_bytenr == 0) + goto add_to_batch; + + disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); + + if (btrfs_file_extent_compression(src, extent)) { + extent_offset = 0; + extent_num_bytes = disk_num_bytes; + } else { + extent_offset = btrfs_file_extent_offset(src, extent); + extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); + } + + csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); + disk_bytenr += extent_offset; + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); + if (ret) + goto out; + + list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { + if (!ret) + ret = log_csums(trans, inode, log, sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + +add_to_batch: + ins_sizes[dst_index] = btrfs_item_size(src, src_slot); + batch.total_data_size += ins_sizes[dst_index]; + batch.nr++; + dst_index++; + } + + /* + * We have a leaf full of old extent items that don't need to be logged, + * so we don't need to do anything. 
+ */ + if (batch.nr == 0) + goto out; + + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); + if (ret) + goto out; + + dst_index = 0; + for (i = 0; i < nr; i++) { + const int src_slot = start_slot + i; + const int dst_slot = dst_path->slots[0] + dst_index; + struct btrfs_key key; + unsigned long src_offset; + unsigned long dst_offset; + + /* + * We're done, all the remaining items in the source leaf + * correspond to old file extent items. + */ + if (dst_index >= batch.nr) + break; + + btrfs_item_key_to_cpu(src, &key, src_slot); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto copy_item; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + /* See the comment in the previous loop, same logic. */ + if (btrfs_file_extent_generation(src, extent) < trans->transid && + key.offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + +copy_item: + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); + src_offset = btrfs_item_ptr_offset(src, src_slot); + + if (key.type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *inode_item; + + inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, + struct btrfs_inode_item); + fill_inode_item(trans, dst_path->nodes[0], inode_item, + &inode->vfs_inode, + inode_only == LOG_INODE_EXISTS, + logged_isize); + } else { + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[dst_index]); + } + + dst_index++; + } + + btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); + btrfs_release_path(dst_path); +out: + kfree(ins_data); + + return ret; +} + +static int extent_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + const struct extent_map *em1, *em2; + + em1 = list_entry(a, struct extent_map, list); + em2 = list_entry(b, struct extent_map, list); + + if (em1->start < em2->start) + return -1; + else if (em1->start > em2->start) + return 1; + return 0; +} + +static int log_extent_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_root *log_root, + const struct extent_map *em, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_root *csum_root; + u64 csum_offset; + u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; + LIST_HEAD(ordered_sums); + int ret = 0; + + if (inode->flags & BTRFS_INODE_NODATASUM || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + em->block_start == EXTENT_MAP_HOLE) + return 0; + + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. 
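+ *
+ * The remaining cases, handled by the else branch below, look roughly
+ * like this:
+ *
+ *        |--------- logged extent ---------|
+ *   |----- ordered extent ----|
+ *
+ * where we move mod_start forward to the end of the ordered extent and
+ * shrink mod_len accordingly, and:
+ *
+ *   |------------- ordered extent -------------|
+ *        |------ logged extent ------|
+ *
+ * where the ordered extent covers everything that is left, so mod_len
+ * drops to zero.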
+ */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + + /* If we're compressed we have to save the entire range of csums. */ + if (em->compress_type) { + csum_offset = 0; + csum_len = max(em->block_len, em->orig_block_len); + } else { + csum_offset = mod_start - em->start; + csum_len = mod_len; + } + + /* block start is already adjusted for the file extent offset. */ + csum_root = btrfs_csum_root(trans->fs_info, em->block_start); + ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0, false); + if (ret) + return ret; + + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = log_csums(trans, inode, log_root, sums); + list_del(&sums->list); + kfree(sums); + } + + return ret; +} + +static int log_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + const struct extent_map *em, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_drop_extents_args drop_args = { 0 }; + struct btrfs_root *log = inode->root->log_root; + struct btrfs_file_extent_item fi = { 0 }; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + + btrfs_set_stack_file_extent_generation(&fi, trans->transid); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); + else + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - + extent_offset); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } + + btrfs_set_stack_file_extent_offset(&fi, extent_offset); + btrfs_set_stack_file_extent_num_bytes(&fi, em->len); + btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); + btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + + ret = log_extent_csums(trans, inode, log, em, ctx); + if (ret) + return ret; + + /* + * If this is the first time we are logging the inode in the current + * transaction, we can avoid btrfs_drop_extents(), which is expensive + * because it does a deletion search, which always acquires write locks + * for extent buffers at levels 2, 1 and 0. This not only wastes time + * but also adds significant contention in a log tree, since log trees + * are small, with a root at level 2 or 3 at most, due to their short + * life span. 
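+ *
+ * So the code below is effectively:
+ *
+ *   if (first time logging this inode in the transaction)
+ *           insert the new file extent item directly;
+ *   else
+ *           drop the old range from the log and insert the new item,
+ *           reusing the freed slot when it is large enough
+ *           (drop_args.replace_extent / drop_args.extent_inserted).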
+ */ + if (ctx->logged_before) { + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); + if (ret) + return ret; + } + + if (!drop_args.extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; + write_extent_buffer(leaf, &fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(fi)); + btrfs_mark_buffer_dirty(trans, leaf); + + btrfs_release_path(path); + + return ret; +} + +/* + * Log all prealloc extents beyond the inode's i_size to make sure we do not + * lose them after doing a full/fast fsync and replaying the log. We scan the + * subvolume's root instead of iterating the inode's extent map tree because + * otherwise we can log incorrect extent items based on extent map conversion. + * That can happen due to the fact that extent maps are merged when they + * are not in the extent map tree's list of modified extents. + */ +static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key; + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_path *dst_path = NULL; + bool dropped_extents = false; + u64 truncate_offset = i_size; + struct extent_buffer *leaf; + int slot; + int ins_nr = 0; + int start_slot = 0; + int ret; + + if (!(inode->flags & BTRFS_INODE_PREALLOC)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = i_size; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + /* + * We must check if there is a prealloc extent that starts before the + * i_size and crosses the i_size boundary. This is to ensure later we + * truncate down to the end of that extent and not to the i_size, as + * otherwise we end up losing part of the prealloc extent after a log + * replay and with an implicit hole if there is another prealloc extent + * that starts at an offset beyond i_size. 
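+ *
+ * For example (illustrative offsets only):
+ *
+ *   0                      i_size
+ *   |---- written data ----|=== prealloc A ===|      |== prealloc B ==|
+ *
+ * Prealloc A starts before i_size and crosses it, so truncate_offset is
+ * moved from i_size to the end of A; otherwise the truncation below
+ * would chop A at i_size and, after log replay, we would lose A's tail
+ * and end up with an implicit hole before B.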
+ */ + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) + goto out; + + if (ret == 0) { + struct btrfs_file_extent_item *ei; + + leaf = path->nodes[0]; + slot = path->slots[0]; + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_PREALLOC) { + u64 extent_end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, ei); + + if (extent_end > i_size) + truncate_offset = extent_end; + } + } else { + ret = 0; + } + + while (true) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + start_slot, ins_nr, 1, 0); + if (ret < 0) + goto out; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY || + key.offset < i_size) { + path->slots[0]++; + continue; + } + if (!dropped_extents) { + /* + * Avoid logging extent items logged in past fsync calls + * and leading to duplicate keys in the log tree. + */ + ret = truncate_inode_items(trans, root->log_root, inode, + truncate_offset, + BTRFS_EXTENT_DATA_KEY); + if (ret) + goto out; + dropped_extents = true; + } + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + if (!dst_path) { + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + } + } + if (ins_nr > 0) + ret = copy_items(trans, inode, dst_path, path, + start_slot, ins_nr, 1, 0); +out: + btrfs_release_path(path); + btrfs_free_path(dst_path); + return ret; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + struct extent_map *em, *n; + LIST_HEAD(extents); + struct extent_map_tree *tree = &inode->extent_tree; + int ret = 0; + int num = 0; + + write_lock(&tree->lock); + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + list_del_init(&em->list); + /* + * Just an arbitrary number, this can be really CPU intensive + * once we start getting a lot of extents, and really once we + * have a bunch of extents we just want to commit since it will + * be faster. + */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + + if (em->generation < trans->transid) + continue; + + /* We log prealloc extents beyond eof later. */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + em->start >= i_size_read(&inode->vfs_inode)) + continue; + + /* Need a ref to keep it from getting evicted from cache */ + refcount_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); + list_add_tail(&em->list, &extents); + num++; + } + + list_sort(NULL, &extents, extent_cmp); +process: + while (!list_empty(&extents)) { + em = list_entry(extents.next, struct extent_map, list); + + list_del_init(&em->list); + + /* + * If we had an error we just need to delete everybody from our + * private list. 
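+ * Note that even in the error case each extent map on the list still
+ * carries the extra reference and the EXTENT_FLAG_LOGGING bit taken when
+ * it was collected, so the loop below must still call clear_em_logging()
+ * and free_extent_map() on every entry to undo both.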
+ */ + if (ret) { + clear_em_logging(tree, em); + free_extent_map(em); + continue; + } + + write_unlock(&tree->lock); + + ret = log_one_extent(trans, inode, em, path, ctx); + write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); + } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; + + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extents completes after the transaction + * commits and a power failure happens after the transaction commit. + */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; +} + +static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, + struct btrfs_path *path, u64 *size_ret) +{ + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *size_ret = 0; + } else { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode + * size stored in the btree, return the inode's i_size, so + * that we get a correct inode size after replaying the log + * when before a power failure we had a shrinking truncate + * followed by addition of a new name (rename / new hard link). + * Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a + * write that expands the inode's size and logging a new name + * immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; + } + + btrfs_release_path(path); + return 0; +} + +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). 
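+ *
+ * A minimal example of the case this covers:
+ *
+ *   $ setfattr -n user.foo -v bar /mnt/file
+ *   $ xfs_io -c "fsync" /mnt/file
+ *   $ setfattr -x user.foo /mnt/file
+ *   $ xfs_io -c "fsync" /mnt/file
+ *   <power failure, mount, log replay>
+ *
+ * After replay user.foo must be gone, which can only be detected at
+ * replay time because the log has all the remaining xattrs while the
+ * fs/subvol tree still has the deleted one.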
+ */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + struct btrfs_root *root = inode->root; + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + bool found_xattrs = false; + + if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) + return 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + start_slot, ins_nr, 1, 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + found_xattrs = true; + cond_resched(); + } + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + start_slot, ins_nr, 1, 0); + if (ret < 0) + return ret; + } + + if (!found_xattrs) + set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); + + return 0; +} + +/* + * When using the NO_HOLES feature if we punched a hole that causes the + * deletion of entire leafs or all the extent items of the first leaf (the one + * that contains the inode item and references) we may end up not processing + * any extents, because there are no leafs with a generation matching the + * current transaction that have extent items for our inode. So we need to find + * if any holes exist and then log them. We also need to log holes after any + * truncate operation that changes the inode's size. + */ +static int btrfs_log_holes(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(&inode->vfs_inode); + u64 prev_extent_end = 0; + int ret; + + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + /* We have a hole, log it. */ + if (prev_extent_end < key.offset) { + const u64 hole_len = key.offset - prev_extent_end; + + /* + * Release the path to avoid deadlocks with other code + * paths that search the root while holding locks on + * leafs from the log root. + */ + btrfs_release_path(path); + ret = btrfs_insert_hole_extent(trans, root->log_root, + ino, prev_extent_end, + hole_len); + if (ret < 0) + return ret; + + /* + * Search for the same key again in the root. 
Since it's + * an extent item and we are holding the inode lock, the + * key must still exist. If it doesn't just emit warning + * and return an error to fall back to a transaction + * commit. + */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (WARN_ON(ret > 0)) + return -ENOENT; + leaf = path->nodes[0]; + } + + prev_extent_end = btrfs_file_extent_end(path); + path->slots[0]++; + cond_resched(); + } + + if (prev_extent_end < i_size) { + u64 hole_len; + + btrfs_release_path(path); + hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + ret = btrfs_insert_hole_extent(trans, root->log_root, ino, + prev_extent_end, hole_len); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. 
+ */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct btrfs_inode *inode, + u64 *other_ino, u64 *other_parent) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + struct fscrypt_str name_str; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + + name_str.name = name; + name_str.len = this_name_len; + di = btrfs_lookup_dir_item(NULL, inode->root, search_path, + parent, &name_str, 0); + if (di && !IS_ERR(di)) { + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(search_path->nodes[0], + di, &di_key); + if (di_key.type == BTRFS_INODE_ITEM_KEY) { + if (di_key.objectid != key->objectid) { + ret = 1; + *other_ino = di_key.objectid; + *other_parent = parent; + } else { + ret = 0; + } + } else { + ret = -EAGAIN; + } + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + +/* + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. + */ +static bool need_log_inode(const struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + + /* + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. 
+ * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. + */ + if (inode_logged(trans, inode, NULL) == 1 && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; + + return true; +} + +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. + * See process_dir_items_leaf() for details about why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not acquire their VFS lock, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not acquiring the VFS lock of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = start_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + u64 ino = btrfs_ino(start_inode); + struct btrfs_inode *curr_inode = start_inode; + int ret = 0; + + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. + */ + if (ctx->logging_new_name) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Pairs with btrfs_add_delayed_iput below. 
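+ * (The put side uses a delayed iput rather than a plain iput() because,
+ *  as elsewhere in btrfs, dropping the last reference here could kick off
+ *  inode eviction work in this logging context; the delayed iputs are
+ *  processed later, for example by the cleaner kthread.)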
*/ + ihold(&curr_inode->vfs_inode); + + while (true) { + struct inode *vfs_inode; + struct btrfs_key key; + struct btrfs_key found_key; + u64 next_index; + bool continue_curr_inode = true; + int iter_ret; + + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = btrfs_get_first_dir_index_to_log(curr_inode); + next_index = key.offset; +again: + btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + int log_mode = LOG_INODE_EXISTS; + int type; + + if (found_key.objectid != ino || + found_key.type != BTRFS_DIR_INDEX_KEY) { + continue_curr_inode = false; + break; + } + + next_index = found_key.offset + 1; + + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + type = btrfs_dir_ftype(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + btrfs_release_path(path); + di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto out; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + break; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), + log_mode, ctx); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (ret) + goto out; + if (ctx->log_new_dentries) { + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + ret = -ENOMEM; + goto out; + } + dir_elem->ino = di_key.objectid; + list_add_tail(&dir_elem->list, &dir_list); + } + break; + } + + btrfs_release_path(path); + + if (iter_ret < 0) { + ret = iter_ret; + goto out; + } else if (iter_ret > 0) { + continue_curr_inode = false; + } else { + key = found_key; + } + + if (continue_curr_inode && key.offset < (u64)-1) { + key.offset++; + goto again; + } + + btrfs_set_first_dir_index_to_log(curr_inode, next_index); + + if (list_empty(&dir_list)) + break; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); + ino = dir_elem->ino; + list_del(&dir_elem->list); + kfree(dir_elem); + + btrfs_add_delayed_iput(curr_inode); + curr_inode = NULL; + + vfs_inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(vfs_inode)) { + ret = PTR_ERR(vfs_inode); + break; + } + curr_inode = BTRFS_I(vfs_inode); + } +out: + btrfs_free_path(path); + if (curr_inode) + btrfs_add_delayed_iput(curr_inode); + + if (ret) { + struct btrfs_dir_list *next; + + list_for_each_entry_safe(dir_elem, next, &dir_list, list) + kfree(dir_elem); + } + + return ret; +} + +struct btrfs_ino_list { + u64 ino; + u64 parent; + struct list_head list; +}; + +static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ino_list *curr; + struct btrfs_ino_list *next; + + list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { + list_del(&curr->list); + kfree(curr); + } +} + +static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, + struct btrfs_path *path) +{ + struct btrfs_key key; + int ret; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (WARN_ON_ONCE(ret > 0)) { + /* + * We have previously found the inode through the commit root + * so this should not happen. 
If it does, just error out and + * fallback to a transaction commit. + */ + ret = -ENOENT; + } else if (ret == 0) { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) + ret = 1; + } + + btrfs_release_path(path); + path->search_commit_root = 0; + path->skip_locking = 0; + + return ret; +} + +static int add_conflicting_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 ino, u64 parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_ino_list *ino_elem; + struct inode *inode; + + /* + * It's rare to have a lot of conflicting inodes, in practice it is not + * common to have more than 1 or 2. We don't want to collect too many, + * as we could end up logging too many inodes (even if only in + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + return BTRFS_LOG_FORCE_COMMIT; + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* + * If the other inode that had a conflicting dir entry was deleted in + * the current transaction then we either: + * + * 1) Log the parent directory (later after adding it to the list) if + * the inode is a directory. This is because it may be a deleted + * subvolume/snapshot or it may be a regular directory that had + * deleted subvolumes/snapshots (or subdirectories that had them), + * and at the moment we can't deal with dropping subvolumes/snapshots + * during log replay. So we just log the parent, which will result in + * a fallback to a transaction commit if we are dealing with those + * cases (last_unlink_trans will match the current transaction); + * + * 2) Do nothing if it's not a directory. During log replay we simply + * unlink the conflicting dentry from the parent directory and then + * add the dentry for our inode. Like this we can avoid logging the + * parent directory (and maybe fallback to a transaction commit in + * case it has a last_unlink_trans == trans->transid, due to moving + * some inode from it to some other directory). + */ + if (IS_ERR(inode)) { + int ret = PTR_ERR(inode); + + if (ret != -ENOENT) + return ret; + + ret = conflicting_inode_is_dir(root, ino, path); + /* Not a directory or we got an error. */ + if (ret <= 0) + return ret; + + /* Conflicting inode is a directory, so we'll log its parent. */ + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; + + return 0; + } + + /* + * If the inode was already logged skip it - otherwise we can hit an + * infinite loop. 
Example: + * + * From the commit root (previous transaction) we have the following + * inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + * + * Here we can use need_log_inode() because we only need to log the + * inode in LOG_INODE_EXISTS mode and rename operations update the log, + * so that the log ends up with the new name and without the old name. + */ + if (!need_log_inode(trans, BTRFS_I(inode))) { + btrfs_add_delayed_iput(BTRFS_I(inode)); + return 0; + } + + btrfs_add_delayed_iput(BTRFS_I(inode)); + + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; + + return 0; +} + +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + + /* + * Conflicting inodes are logged by the first call to btrfs_log_inode(), + * otherwise we could have unbounded recursion of btrfs_log_inode() + * calls. This check guarantees we can have only 1 level of recursion. + */ + if (ctx->logging_conflict_inodes) + return 0; + + ctx->logging_conflict_inodes = true; + + /* + * New conflicting inodes may be found and added to the list while we + * are logging a conflicting inode, so keep iterating while the list is + * not empty. + */ + while (!list_empty(&ctx->conflict_inodes)) { + struct btrfs_ino_list *curr; + struct inode *inode; + u64 ino; + u64 parent; + + curr = list_first_entry(&ctx->conflict_inodes, + struct btrfs_ino_list, list); + ino = curr->ino; + parent = curr->parent; + list_del(&curr->list); + kfree(curr); + + inode = btrfs_iget(fs_info->sb, ino, root); + /* + * If the other inode that had a conflicting dir entry was + * deleted in the current transaction, we need to log its parent + * directory. See the comment at add_conflicting_inode(). + */ + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret != -ENOENT) + break; + + inode = btrfs_iget(fs_info->sb, parent, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + break; + } + + /* + * Always log the directory, we cannot make this + * conditional on need_log_inode() because the directory + * might have been logged in LOG_INODE_EXISTS mode or + * the dir index of the conflicting inode is not in a + * dir index key range logged for the directory. So we + * must make sure the deletion is recorded. 
+ */ + ret = btrfs_log_inode(trans, BTRFS_I(inode), + LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(BTRFS_I(inode)); + if (ret) + break; + continue; + } + + /* + * Here we can use need_log_inode() because we only need to log + * the inode in LOG_INODE_EXISTS mode and rename operations + * update the log, so that the log ends up with the new name and + * without the old name. + * + * We did this check at add_conflicting_inode(), but here we do + * it again because if some other task logged the inode after + * that, we can avoid doing it again. + */ + if (!need_log_inode(trans, BTRFS_I(inode))) { + btrfs_add_delayed_iput(BTRFS_I(inode)); + continue; + } + + /* + * We are safe logging the other inode without acquiring its + * lock as long as we log with the LOG_INODE_EXISTS mode. We + * are safe against concurrent renames of the other inode as + * well because during a rename we pin the log and update the + * log with the new name before we unpin it. + */ + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(BTRFS_I(inode)); + if (ret) + break; + } + + ctx->logging_conflict_inodes = false; + if (ret) + free_conflicting_inodes(ctx); + + return ret; +} + +static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_key *min_key, + const struct btrfs_key *max_key, + struct btrfs_path *path, + struct btrfs_path *dst_path, + const u64 logged_isize, + const int inode_only, + struct btrfs_log_ctx *ctx, + bool *need_log_inode_item) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + struct btrfs_root *root = inode->root; + int ins_start_slot = 0; + int ins_nr = 0; + int ret; + + while (1) { + ret = btrfs_search_forward(root, min_key, path, trans->transid); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } +again: + /* Note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key->objectid != max_key->objectid) + break; + if (min_key->type > max_key->type) + break; + + if (min_key->type == BTRFS_INODE_ITEM_KEY) { + *need_log_inode_item = false; + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. 
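+ * (As a reminder of why the early break is safe: for a given inode the
+ *  items are sorted by key type, and the types handled here are ordered
+ *  roughly as INODE_ITEM < INODE_REF/INODE_EXTREF < XATTR_ITEM <
+ *  EXTENT_DATA, so once we reach file extent items at or beyond i_size
+ *  there is nothing left for this loop to copy.)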
+ */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + (inode->generation == trans->transid || + ctx->logging_conflict_inodes)) { + u64 other_ino = 0; + u64 other_parent = 0; + + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], min_key, inode, + &other_ino, &other_parent); + if (ret < 0) { + return ret; + } else if (ret > 0 && + other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, + inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + + btrfs_release_path(path); + ret = add_conflicting_inode(trans, root, path, + other_ino, + other_parent, ctx); + if (ret) + return ret; + goto next_key; + } + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + goto next_slot; + } + + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + btrfs_release_path(path); +next_key: + if (min_key->offset < (u64)-1) { + min_key->offset++; + } else if (min_key->type < max_key->type) { + min_key->type++; + min_key->offset = 0; + } else { + break; + } + + /* + * We may process many leaves full of items for our inode, so + * avoid monopolizing a cpu for too long by rescheduling while + * not holding locks on any tree. + */ + cond_resched(); + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. 
+ */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } + + return ret; +} + +static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + const struct btrfs_item_batch *batch, + const struct btrfs_delayed_item *first_item) +{ + const struct btrfs_delayed_item *curr = first_item; + int ret; + + ret = btrfs_insert_empty_items(trans, log, path, batch); + if (ret) + return ret; + + for (int i = 0; i < batch->nr; i++) { + char *data_ptr; + + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + curr = list_next_entry(curr, log_list); + path->slots[0]++; + } + + btrfs_release_path(path); + + return 0; +} + +static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ + const int max_batch_size = 195; + const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *log = inode->root->log_root; + struct btrfs_item_batch batch = { + .nr = 0, + .total_data_size = 0, + }; + const struct btrfs_delayed_item *first = NULL; + const struct btrfs_delayed_item *curr; + char *ins_data; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + u64 curr_batch_size = 0; + int batch_idx = 0; + int ret; + + /* We are adding dir index items to the log tree. */ + lockdep_assert_held(&inode->log_mutex); + + /* + * We collect delayed items before copying index keys from the subvolume + * to the log tree. However just after we collected them, they may have + * been flushed (all of them or just some of them), and therefore we + * could have copied them from the subvolume tree to the log tree. + * So find the first delayed item that was not yet logged (they are + * sorted by index number). + */ + list_for_each_entry(curr, delayed_ins_list, log_list) { + if (curr->index > inode->last_dir_index_offset) { + first = curr; + break; + } + } + + /* Empty list or all delayed items were already logged. 
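+ * (About max_batch_size above: each batched item needs a 17 byte
+ *  struct btrfs_key plus a 4 byte size, and 195 * (17 + 4) = 4095 bytes,
+ *  which is what the "fits in a single 4K page" comment refers to.)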
*/ + if (!first) + return 0; + + ins_data = kmalloc(max_batch_size * sizeof(u32) + + max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; + batch.data_sizes = ins_sizes; + ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); + batch.keys = ins_keys; + + curr = first; + while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { + const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); + + if (curr_batch_size + curr_size > leaf_data_size || + batch.nr == max_batch_size) { + ret = insert_delayed_items_batch(trans, log, path, + &batch, first); + if (ret) + goto out; + batch_idx = 0; + batch.nr = 0; + batch.total_data_size = 0; + curr_batch_size = 0; + first = curr; + } + + ins_sizes[batch_idx] = curr->data_len; + ins_keys[batch_idx].objectid = ino; + ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; + ins_keys[batch_idx].offset = curr->index; + curr_batch_size += curr_size; + batch.total_data_size += curr->data_len; + batch.nr++; + batch_idx++; + curr = list_next_entry(curr, log_list); + } + + ASSERT(batch.nr >= 1); + ret = insert_delayed_items_batch(trans, log, path, &batch, first); + + curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, + log_list); + inode->last_dir_index_offset = curr->index; +out: + kfree(ins_data); + + return ret; +} + +static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + const struct btrfs_delayed_item *curr; + + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + u64 first_dir_index = curr->index; + u64 last_dir_index; + const struct btrfs_delayed_item *next; + int ret; + + /* + * Find a range of consecutive dir index items to delete. Like + * this we log a single dir range item spanning several contiguous + * dir items instead of logging one range item per dir index item. 
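+ *
+ * For example, if the delayed deletions cover dir indexes 10, 11, 12, 20
+ * and 21, the loop below emits two range items, one for [10, 12] and one
+ * for [20, 21], instead of five individual items.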
+ */ + next = list_next_entry(curr, log_list); + while (!list_entry_is_head(next, delayed_del_list, log_list)) { + if (next->index != curr->index + 1) + break; + curr = next; + next = list_next_entry(next, log_list); + } + + last_dir_index = curr->index; + ASSERT(last_dir_index >= first_dir_index); + + ret = insert_dir_log_key(trans, inode->root->log_root, path, + ino, first_dir_index, last_dir_index); + if (ret) + return ret; + curr = list_next_entry(curr, log_list); + } + + return 0; +} + +static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + const struct list_head *delayed_del_list, + const struct btrfs_delayed_item *first, + const struct btrfs_delayed_item **last_ret) +{ + const struct btrfs_delayed_item *next; + struct extent_buffer *leaf = path->nodes[0]; + const int last_slot = btrfs_header_nritems(leaf) - 1; + int slot = path->slots[0] + 1; + const u64 ino = btrfs_ino(inode); + + next = list_next_entry(first, log_list); + + while (slot < last_slot && + !list_entry_is_head(next, delayed_del_list, log_list)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) + break; + + slot++; + *last_ret = next; + next = list_next_entry(next, log_list); + } + + return btrfs_del_items(trans, inode->root->log_root, path, + path->slots[0], slot - path->slots[0]); +} + +static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + const struct btrfs_delayed_item *curr; + u64 last_range_start = 0; + u64 last_range_end = 0; + struct btrfs_key key; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + const struct btrfs_delayed_item *last = curr; + u64 first_dir_index = curr->index; + u64 last_dir_index; + bool deleted_items = false; + int ret; + + key.offset = curr->index; + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + if (ret < 0) { + return ret; + } else if (ret == 0) { + ret = batch_delete_dir_index_items(trans, inode, path, ctx, + delayed_del_list, curr, + &last); + if (ret) + return ret; + deleted_items = true; + } + + btrfs_release_path(path); + + /* + * If we deleted items from the leaf, it means we have a range + * item logging their range, so no need to add one or update an + * existing one. Otherwise we have to log a dir range item. + */ + if (deleted_items) + goto next_batch; + + last_dir_index = last->index; + ASSERT(last_dir_index >= first_dir_index); + /* + * If this range starts right after where the previous one ends, + * then we want to reuse the previous range item and change its + * end offset to the end of this range. This is just to minimize + * leaf space usage, by avoiding adding a new range item. 
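+ *
+ * For example, if the previous batch logged the range [10, 15] and this
+ * batch starts at index 16, we log [10, new_end] again instead of adding
+ * a second, adjacent range item.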
+ */ + if (last_range_end != 0 && first_dir_index == last_range_end + 1) + first_dir_index = last_range_start; + + ret = insert_dir_log_key(trans, log, path, key.objectid, + first_dir_index, last_dir_index); + if (ret) + return ret; + + last_range_start = first_dir_index; + last_range_end = last_dir_index; +next_batch: + curr = list_next_entry(last, log_list); + } + + return 0; +} + +static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + /* + * We are deleting dir index items from the log tree or adding range + * items to it. + */ + lockdep_assert_held(&inode->log_mutex); + + if (list_empty(delayed_del_list)) + return 0; + + if (ctx->logged_before) + return log_delayed_deletions_incremental(trans, inode, path, + delayed_del_list, ctx); + + return log_delayed_deletions_full(trans, inode, path, delayed_del_list, + ctx); +} + +/* + * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed + * items instead of the subvolume tree. + */ +static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + const bool orig_log_new_dentries = ctx->log_new_dentries; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_item *item; + int ret = 0; + + /* + * No need for the log mutex, plus to avoid potential deadlocks or + * lockdep annotations due to nesting of delayed inode mutexes and log + * mutexes. + */ + lockdep_assert_not_held(&inode->log_mutex); + + ASSERT(!ctx->logging_new_delayed_dentries); + ctx->logging_new_delayed_dentries = true; + + list_for_each_entry(item, delayed_ins_list, log_list) { + struct btrfs_dir_item *dir_item; + struct inode *di_inode; + struct btrfs_key key; + int log_mode = LOG_INODE_EXISTS; + + dir_item = (struct btrfs_dir_item *)item->data; + btrfs_disk_key_to_cpu(&key, &dir_item->location); + + if (key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + break; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + continue; + } + + if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + + if (!ret && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + + if (ret) + break; + } + + ctx->log_new_dentries = orig_log_new_dentries; + ctx->logging_new_delayed_dentries = false; + + return ret; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. 
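+ *
+ * Roughly, the steps below are:
+ *
+ * 1) depending on the logging mode and on whether the inode was logged
+ *    before in this transaction, drop or truncate the items previously
+ *    copied into the log;
+ * 2) copy the inode's changed items (and csums for new extents) from the
+ *    subvolume tree into the log tree;
+ * 3) log all xattrs, and for the slow (non fast-search) path also any
+ *    file holes when the NO_HOLES feature is enabled;
+ * 4) log the inode item itself if needed and, for fast fsyncs, the
+ *    modified extent maps plus prealloc extents beyond i_size;
+ * 5) for directories logged in full mode, log the changed dir items and
+ *    the collected delayed insertions and deletions;
+ * 6) update logged_trans and last_log_commit so that later fsyncs and log
+ *    updates know the inode is already in the log.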
+ */ +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + int inode_only, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = inode->root->log_root; + int ret; + bool fast_search = false; + u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 logged_isize = 0; + bool need_log_inode_item = true; + bool xattrs_logged = false; + bool inode_item_dropped = true; + bool full_dir_logging = false; + LIST_HEAD(delayed_ins_list); + LIST_HEAD(delayed_del_list); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } + + min_key.objectid = ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = ino; + + + /* today the code can only do partial logging of directories */ + if (S_ISDIR(inode->vfs_inode.i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &inode->runtime_flags) && + inode_only >= LOG_INODE_EXISTS)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) + full_dir_logging = true; + + /* + * If we are logging a directory while we are logging dentries of the + * delayed items of some other inode, then we need to flush the delayed + * items of this directory and not log the delayed items directly. This + * is to prevent more than one level of recursion into btrfs_log_inode() + * by having something like this: + * + * $ mkdir -p a/b/c/d/e/f/g/h/... + * $ xfs_io -c "fsync" a + * + * Where all directories in the path did not exist before and are + * created in the current transaction. + * So in such a case we directly log the delayed items of the main + * directory ("a") without flushing them first, while for each of its + * subdirectories we flush their delayed items before logging them. + * This prevents a potential unbounded recursion like this: + * + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * (...) + * + * We have thresholds for the maximum number of delayed items to have in + * memory, and once they are hit, the items are flushed asynchronously. + * However the limit is quite high, so lets prevent deep levels of + * recursion to happen by limiting the maximum depth to be 1. + */ + if (full_dir_logging && ctx->logging_new_delayed_dentries) { + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) + goto out; + } + + mutex_lock(&inode->log_mutex); + + /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). + */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + + /* + * Before logging the inode item, cache the value returned by + * inode_logged(), because after that we have the need to figure out if + * the inode was previously logged in this transaction. 
+ */ + ret = inode_logged(trans, inode, path); + if (ret < 0) + goto out_unlock; + ctx->logged_before = (ret == 1); + ret = 0; + + /* + * This is for cases where logging a directory could result in losing a + * a file after replaying the log. For example, if we move a file from a + * directory A to a directory B, then fsync directory A, we have no way + * to known the file was moved from A to B, so logging just A would + * result in losing the file after a log replay. + */ + if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto out_unlock; + } + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->vfs_inode.i_mode)) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + BTRFS_XATTR_ITEM_KEY); + } else { + if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { + /* + * Make sure the new inode item we write to the log has + * the same isize as the current one (if it exists). + * This is necessary to prevent data loss after log + * replay, and also to prevent doing a wrong expanding + * truncate - for e.g. create file, write 4K into offset + * 0, fsync, write 4K into offset 4096, add hard link, + * fsync some other file (to sync log), power fail - if + * we use the inode's current i_size, after log replay + * we get a 8Kb file, with the last 4Kb extent as a hole + * (zeroes), as if an expanding truncate happened, + * instead of getting a file of 4Kb only. + */ + ret = logged_inode_size(log, inode, path, &logged_isize); + if (ret) + goto out_unlock; + } + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &inode->runtime_flags)) { + if (inode_only == LOG_INODE_EXISTS) { + max_key.type = BTRFS_XATTR_ITEM_KEY; + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, + inode, max_key.type); + } else { + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &inode->runtime_flags); + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &inode->runtime_flags); + if (ctx->logged_before) + ret = truncate_inode_items(trans, log, + inode, 0, 0); + } + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &inode->runtime_flags) || + inode_only == LOG_INODE_EXISTS) { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + max_key.type = BTRFS_XATTR_ITEM_KEY; + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + inode_item_dropped = false; + goto log_extents; + } + + } + if (ret) + goto out_unlock; + + /* + * If we are logging a directory in full mode, collect the delayed items + * before iterating the subvolume tree, so that we don't miss any new + * dir index items in case they get flushed while or right after we are + * iterating the subvolume tree. 
+ */ + if (full_dir_logging && !ctx->logging_new_delayed_dentries) + btrfs_log_get_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); + + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + path, dst_path, logged_isize, + inode_only, ctx, + &need_log_inode_item); + if (ret) + goto out_unlock; + + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) + goto out_unlock; + xattrs_logged = true; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = btrfs_log_holes(trans, inode, path); + if (ret) + goto out_unlock; + } +log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); + if (need_log_inode_item) { + ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); + if (ret) + goto out_unlock; + /* + * If we are doing a fast fsync and the inode was logged before + * in this transaction, we don't need to log the xattrs because + * they were logged before. If xattrs were added, changed or + * deleted since the last time we logged the inode, then we have + * already logged them because the inode had the runtime flag + * BTRFS_INODE_COPY_EVERYTHING set. + */ + if (!xattrs_logged && inode->logged_trans < trans->transid) { + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) + goto out_unlock; + btrfs_release_path(path); + } + } + if (fast_search) { + ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); + if (ret) + goto out_unlock; + } else if (inode_only == LOG_INODE_ALL) { + struct extent_map *em, *n; + + write_lock(&em_tree->lock); + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); + write_unlock(&em_tree->lock); + } + + if (full_dir_logging) { + ret = log_directory_changes(trans, inode, path, dst_path, ctx); + if (ret) + goto out_unlock; + ret = log_delayed_insertion_items(trans, inode, path, + &delayed_ins_list, ctx); + if (ret) + goto out_unlock; + ret = log_delayed_deletion_items(trans, inode, path, + &delayed_del_list, ctx); + if (ret) + goto out_unlock; + } + + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists. + * We do this for three reasons: + * + * 1) We might have had buffered writes to this inode that were + * flushed and had their ordered extents completed in this + * transaction, but we did not previously log the inode with + * LOG_INODE_ALL. Later the inode was evicted and after that + * it was loaded again and this LOG_INODE_EXISTS log operation + * happened. We must make sure that if an explicit fsync against + * the inode is performed later, it logs the new extents, an + * updated inode item, etc, and syncs the log. The same logic + * applies to direct IO writes instead of buffered writes. + * + * 2) When we log the inode with LOG_INODE_EXISTS, its inode item + * is logged with an i_size of 0 or whatever value was logged + * before. If later the i_size of the inode is increased by a + * truncate operation, the log is synced through an fsync of + * some other inode and then finally an explicit fsync against + * this inode is made, we must make sure this fsync logs the + * inode with the new i_size, the hole between old i_size and + * the new i_size, and syncs the log. 
+ * + * 3) If we are logging that an ancestor inode exists as part of + * logging a new name from a link or rename operation, don't update + * its last_log_commit - otherwise if an explicit fsync is made + * against an ancestor, the fsync considers the inode in the log + * and doesn't sync the log, resulting in the ancestor missing after + * a power failure unless the log was synced as part of an fsync + * against any other unrelated inode. + */ + if (inode_only != LOG_INODE_EXISTS) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + + /* + * Reset the last_reflink_trans so that the next fsync does not need to + * go through the slower path when logging extents and their checksums. + */ + if (inode_only == LOG_INODE_ALL) + inode->last_reflink_trans = 0; + +out_unlock: + mutex_unlock(&inode->log_mutex); +out: + btrfs_free_path(path); + btrfs_free_path(dst_path); + + if (ret) + free_conflicting_inodes(ctx); + else + ret = log_conflicting_inodes(trans, inode->root, ctx); + + if (full_dir_logging && !ctx->logging_new_delayed_dentries) { + if (!ret) + ret = log_new_delayed_dentries(trans, inode, + &delayed_ins_list, ctx); + + btrfs_log_put_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); + } + + return ret; +} + +static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_locking = 1; + path->search_commit_root = 1; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + unsigned long ptr; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; + struct inode *dir_inode; + + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + inode_key.objectid = btrfs_inode_extref_parent( + leaf, extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + inode_key.objectid = key.offset; + cur_offset = item_size; + } + + dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, + root); + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). 
+ * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } + + if (!need_log_inode(trans, BTRFS_I(dir_inode))) { + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + continue; + } + + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), + LOG_INODE_ALL, ctx); + if (!ret && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, + BTRFS_I(dir_inode), ctx); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (ret) + goto out; + } + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int log_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + while (true) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf; + int slot; + struct btrfs_key search_key; + struct inode *inode; + u64 ino; + int ret = 0; + + btrfs_release_path(path); + + ino = found_key.offset; + + search_key.objectid = found_key.offset; + search_key.type = BTRFS_INODE_ITEM_KEY; + search_key.offset = 0; + inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (BTRFS_I(inode)->generation >= trans->transid && + need_log_inode(trans, BTRFS_I(inode))) + ret = btrfs_log_inode(trans, BTRFS_I(inode), + LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(BTRFS_I(inode)); + if (ret) + return ret; + + if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) + break; + + search_key.type = BTRFS_INODE_REF_KEY; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != search_key.objectid || + found_key.type != BTRFS_INODE_REF_KEY) + return -ENOENT; + } + return 0; +} + +static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + struct dentry *old_parent = NULL; + struct super_block *sb = inode->vfs_inode.i_sb; + int ret = 0; + + while (true) { + if (!parent || d_really_is_negative(parent) || + sb != parent->d_sb) + break; + + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) + break; + + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) { + ret = btrfs_log_inode(trans, inode, + LOG_INODE_EXISTS, ctx); + if (ret) + break; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + dput(old_parent); + + return ret; +} + +static int log_all_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); + struct btrfs_path *path; + struct btrfs_key search_key; + 
int ret; + + /* + * For a single hard link case, go through a fast path that does not + * need to iterate the fs/subvolume tree. + */ + if (inode->vfs_inode.i_nlink < 2) + return log_new_ancestors_fast(trans, inode, parent, ctx); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) + path->slots[0]++; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key found_key; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != ino || + found_key.type > BTRFS_INODE_EXTREF_KEY) + break; + + /* + * Don't deal with extended references because they are rare + * cases and too complex to deal with (we would need to keep + * track of which subitem we are processing for each item in + * this loop, etc). So just return some error to fallback to + * a transaction commit. + */ + if (found_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = -EMLINK; + goto out; + } + + /* + * Logging ancestors needs to do more searches on the fs/subvol + * tree, so it releases the path as needed to avoid deadlocks. + * Keep track of the last inode ref key and resume from that key + * after logging all new ancestors for the current hard link. + */ + memcpy(&search_key, &found_key, sizeof(search_key)); + + ret = log_new_ancestors(trans, root, path, ctx); + if (ret) + goto out; + btrfs_release_path(path); + goto again; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + int inode_only, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + bool log_dentries = false; + + if (btrfs_test_opt(fs_info, NOTREELOG)) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto end_no_trans; + } + + if (btrfs_root_refs(&root->root_item) == 0) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto end_no_trans; + } + + /* + * Skip already logged inodes or inodes corresponding to tmpfiles + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). + */ + if ((btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) || + inode->vfs_inode.i_nlink == 0) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + + ret = start_log_trans(trans, root, ctx); + if (ret) + goto end_no_trans; + + ret = btrfs_log_inode(trans, inode, inode_only, ctx); + if (ret) + goto end_trans; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. 
+ */ + if (S_ISREG(inode->vfs_inode.i_mode) && + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) { + ret = 0; + goto end_trans; + } + + if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) + log_dentries = true; + + /* + * On unlink we must make sure all our current and old parent directory + * inodes are fully logged. This is to prevent leaving dangling + * directory index entries in directories that were our parents but are + * not anymore. Not doing this results in the old parent directory being + * impossible to delete after log replay (rmdir will always fail with + * error -ENOTEMPTY). + * + * Example 1: + * + * mkdir testdir + * touch testdir/foo + * ln testdir/foo testdir/bar + * sync + * unlink testdir/bar + * xfs_io -c fsync testdir/foo + * + * mount fs, triggers log replay + * + * If we don't log the parent directory (testdir), after log replay the + * directory still has an entry pointing to the file inode using the bar + * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and + * the file inode has a link count of 1. + * + * Example 2: + * + * mkdir testdir + * touch foo + * ln foo testdir/foo2 + * ln foo testdir/foo3 + * sync + * unlink testdir/foo3 + * xfs_io -c fsync foo + * + * mount fs, triggers log replay + * + * Similar to the first example, after log replay the parent directory + * testdir still has an entry pointing to the file inode with name foo3 + * but the file inode does not have a matching BTRFS_INODE_REF_KEY item + * and has a link count of 2. + */ + if (inode->last_unlink_trans >= trans->transid) { + ret = btrfs_log_all_parents(trans, inode, ctx); + if (ret) + goto end_trans; + } + + ret = log_all_new_ancestors(trans, inode, parent, ctx); + if (ret) + goto end_trans; + + if (log_dentries) + ret = log_new_dir_dentries(trans, inode, ctx); + else + ret = 0; +end_trans: + if (ret < 0) { + btrfs_set_log_full_commit(trans); + ret = BTRFS_LOG_FORCE_COMMIT; + } + + if (ret) + btrfs_remove_log_ctx(root, ctx); + btrfs_end_log_trans(root); +end_no_trans: + return ret; +} + +/* + * it is not safe to log a dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk.
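Example 1 above maps directly onto a handful of system calls. A minimal user-space sketch (paths are arbitrary and assumed to live on the btrfs filesystem under test):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd;

	if (mkdir("testdir", 0755) != 0)
		exit(1);
	fd = open("testdir/foo", O_CREAT | O_WRONLY, 0644);
	if (fd < 0 || link("testdir/foo", "testdir/bar") != 0)
		exit(1);
	sync();
	if (unlink("testdir/bar") != 0)
		exit(1);
	fsync(fd);	/* same effect as: xfs_io -c fsync testdir/foo */
	/* power fail here, then mount to trigger log replay */
	return 0;
}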
+ */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct dentry *dentry, + struct btrfs_log_ctx *ctx) +{ + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, + LOG_INODE_ALL, ctx); + dput(parent); + + return ret; +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = LOG_WALK_PIN_ONLY, + }; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error; + } + + wc.trans = trans; + wc.pin = 1; + + ret = walk_log_tree(trans, log_root_tree, &wc); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error; + } + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto error; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_tree_root(log_root_tree, &found_key); + if (IS_ERR(log)) { + ret = PTR_ERR(log); + btrfs_abort_transaction(trans, ret); + goto error; + } + + wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, + true); + if (IS_ERR(wc.replay_dest)) { + ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. + */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(trans, + log->node->start, + log->node->len); + btrfs_put_root(log); + + if (!ret) + goto next; + btrfs_abort_transaction(trans, ret); + goto error; + } + + wc.replay_dest->log_root = log; + ret = btrfs_record_root_in_trans(trans, wc.replay_dest); + if (ret) + /* The loop needs to continue due to the root refs */ + btrfs_abort_transaction(trans, ret); + else + ret = walk_log_tree(trans, log, &wc); + + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + if (ret) + btrfs_abort_transaction(trans, ret); + } + + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + struct btrfs_root *root = wc.replay_dest; + + btrfs_release_path(path); + + /* + * We have just replayed everything, and the highest + * objectid of fs roots probably has changed in case + * some inode_item's got replayed. + * + * root->objectid_mutex is not acquired as log replay + * could only happen during mount. 
+ */ + ret = btrfs_init_root_free_objectid(root); + if (ret) + btrfs_abort_transaction(trans, ret); + } + + wc.replay_dest->log_root = NULL; + btrfs_put_root(wc.replay_dest); + btrfs_put_root(log); + + if (ret) + goto error; +next: + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + btrfs_release_path(path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + /* step four: commit the transaction, which also unpins the blocks */ + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); + btrfs_put_root(log_root_tree); + + return 0; +error: + if (wc.trans) + btrfs_end_transaction(wc.trans); + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); + btrfs_free_path(path); + return ret; +} + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files that were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + * + * Must be called before the unlink operations (updates to the subvolume tree, + * inodes, etc) are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + bool for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + mutex_lock(&inode->log_mutex); + inode->last_unlink_trans = trans->transid; + mutex_unlock(&inode->log_mutex); + + if (!for_rename) + return; + + /* + * If this directory was already logged, any new names will be logged + * with btrfs_log_new_name() and old names will be deleted from the log + * tree with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). + */ + if (inode_logged(trans, dir, NULL) == 1) + return; + + /* + * If the inode we're about to unlink was logged before, the log will be + * properly updated with the new name with btrfs_log_new_name() and the + * old name removed with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). + */ + if (inode_logged(trans, inode, NULL) == 1) + return; + + /* + * when renaming files across directories, if the directory + * we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + mutex_lock(&dir->log_mutex); + dir->last_unlink_trans = trans->transid; + mutex_unlock(&dir->log_mutex); +} + +/* + * Make sure that if someone attempts to fsync the parent directory of a deleted + * snapshot, it ends up triggering a transaction commit.
This is to guarantee + * that after replaying the log tree of the parent directory's root we will not + * see the snapshot anymore and at log replay time we will not see any log tree + * corresponding to the deleted snapshot's root, which could lead to replaying + * it after replaying the log tree of the parent directory (which would replay + * the snapshot delete operation). + * + * Must be called before the actual snapshot destroy operation (updates to the + * parent root and tree of tree roots trees, etc) are done. + */ +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir) +{ + mutex_lock(&dir->log_mutex); + dir->last_unlink_trans = trans->transid; + mutex_unlock(&dir->log_mutex); +} + +/* + * Update the log after adding a new name for an inode. + * + * @trans: Transaction handle. + * @old_dentry: The dentry associated with the old name and the old + * parent directory. + * @old_dir: The inode of the previous parent directory for the case + * of a rename. For a link operation, it must be NULL. + * @old_dir_index: The index number associated with the old name, meaningful + * only for rename operations (when @old_dir is not NULL). + * Ignored for link operations. + * @parent: The dentry associated with the directory under which the + * new name is located. + * + * Call this after adding a new name for an inode, as a result of a link or + * rename operation, and it will properly update the log to reflect the new name. + */ +void btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent) +{ + struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_log_ctx ctx; + bool log_pinned = false; + int ret; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (!S_ISDIR(inode->vfs_inode.i_mode)) + inode->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + ret = inode_logged(trans, inode, NULL); + if (ret < 0) { + goto out; + } else if (ret == 0) { + if (!old_dir) + return; + /* + * If the inode was not logged and we are doing a rename (old_dir is not + * NULL), check if old_dir was logged - if it was not we can return and + * do nothing. + */ + ret = inode_logged(trans, old_dir, NULL); + if (ret < 0) + goto out; + else if (ret == 0) + return; + } + ret = 0; + + /* + * If we are doing a rename (old_dir is not NULL) from a directory that + * was previously logged, make sure that on log replay we get the old + * dir entry deleted. This is needed because we will also log the new + * name of the renamed inode, so we need to make sure that after log + * replay we don't end up with both the new and old dir entries existing. + */ + if (old_dir && old_dir->logged_trans == trans->transid) { + struct btrfs_root *log = old_dir->root->log_root; + struct btrfs_path *path; + struct fscrypt_name fname; + + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + + ret = fscrypt_setup_filename(&old_dir->vfs_inode, + &old_dentry->d_name, 0, &fname); + if (ret) + goto out; + /* + * We have two inodes to update in the log, the old directory and + * the inode that got renamed, so we must pin the log to prevent + * anyone from syncing the log until we have updated both inodes + * in the log. 
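As context for the rename handling here, the case being guarded against can be driven from user space with a sequence like the sketch below (directory and file names are arbitrary; assumes the current directory is on the btrfs filesystem under test). After a crash and log replay, only the new name is expected to survive:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd;

	if (mkdir("A", 0755) != 0 || mkdir("B", 0755) != 0)
		exit(1);
	fd = open("A/foo", O_CREAT | O_WRONLY, 0644);
	if (fd < 0)
		exit(1);
	fsync(fd);			/* log the inode under its old name */
	if (rename("A/foo", "B/foo") != 0)
		exit(1);
	fsync(fd);			/* sync the log with the new name */
	/* power fail here: B/foo is expected to exist, A/foo is not */
	return 0;
}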
+ */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. + */ + if (WARN_ON_ONCE(ret < 0)) { + fscrypt_free_filename(&fname); + goto out; + } + + log_pinned = true; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + fscrypt_free_filename(&fname); + goto out; + } + + /* + * Other concurrent task might be logging the old directory, + * as it can be triggered when logging other inode that had or + * still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in progress directory logging. + */ + mutex_lock(&old_dir->log_mutex); + ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), + &fname.disk_name, old_dir_index); + if (ret > 0) { + /* + * The dentry does not exist in the log, so record its + * deletion. + */ + btrfs_release_path(path); + ret = insert_dir_log_key(trans, log, path, + btrfs_ino(old_dir), + old_dir_index, old_dir_index); + } + mutex_unlock(&old_dir->log_mutex); + + btrfs_free_path(path); + fscrypt_free_filename(&fname); + if (ret < 0) + goto out; + } + + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + ASSERT(list_empty(&ctx.conflict_inodes)); +out: + /* + * If an error happened mark the log for a full commit because it's not + * consistent and up to date or we couldn't find out if one of the + * inodes was logged before in this transaction. Do it before unpinning + * the log, to avoid any races with someone else trying to commit it. + */ + if (ret < 0) + btrfs_set_log_full_commit(trans); + if (log_pinned) + btrfs_end_log_trans(root); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 0000000000..a550a8a375 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + */ + +#ifndef BTRFS_TREE_LOG_H +#define BTRFS_TREE_LOG_H + +#include "messages.h" +#include "ctree.h" +#include "transaction.h" + +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) + +struct btrfs_log_ctx { + int log_ret; + int log_transid; + bool log_new_dentries; + bool logging_new_name; + bool logging_new_delayed_dentries; + /* Indicate if the inode being logged was logged before. 
*/ + bool logged_before; + struct inode *inode; + struct list_head list; + /* Only used for fast fsyncs. */ + struct list_head ordered_extents; + struct list_head conflict_inodes; + int num_conflict_inodes; + bool logging_conflict_inodes; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + struct inode *inode) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; + ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) +{ + WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid); +} + +static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans) +{ + return READ_ONCE(trans->fs_info->last_trans_log_full_commit) == + trans->transid; +} + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_log_ctx *ctx); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct dentry *dentry, + struct btrfs_log_ctx *ctx); +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct fscrypt_str *name, + struct btrfs_inode *dir, u64 index); +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct fscrypt_str *name, + struct btrfs_inode *inode, u64 dirid); +void btrfs_end_log_trans(struct btrfs_root *root); +void btrfs_pin_log_trans(struct btrfs_root *root); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + bool for_rename); +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir); +void btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent); + +#endif diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c new file mode 100644 index 0000000000..3df6153d5d --- /dev/null +++ b/fs/btrfs/tree-mod-log.c @@ -0,0 +1,1114 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "messages.h" +#include "tree-mod-log.h" +#include "disk-io.h" +#include "fs.h" +#include "accessors.h" +#include "tree-checker.h" + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 logical; + u64 seq; + enum btrfs_mod_log_op op; + + /* + * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS + * operations. + */ + int slot; + + /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ + u64 generation; + + /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. 
*/ + struct btrfs_disk_key key; + u64 blockptr; + + /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; +}; + +/* + * Pull a new tree mod seq number for our operation. + */ +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + write_lock(&fs_info->tree_mod_log_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } + write_unlock(&fs_info->tree_mod_log_lock); + + return elem->seq; +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct tree_mod_elem *tm; + u64 min_seq = BTRFS_SEQ_LAST; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + write_lock(&fs_info->tree_mod_log_lock); + list_del(&elem->list); + elem->seq = 0; + + if (list_empty(&fs_info->tree_mod_seq_list)) { + clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } else { + struct btrfs_seq_list *first; + + first = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + if (seq_putting > first->seq) { + /* + * Blocker with lower sequence number exists, we cannot + * remove anything from the log. + */ + write_unlock(&fs_info->tree_mod_log_lock); + return; + } + min_seq = first->seq; + } + + /* + * Anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = rb_entry(node, struct tree_mod_elem, node); + if (tm->seq >= min_seq) + continue; + rb_erase(node, tm_root); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * Key order of the log: + * node/leaf start address -> sequence + * + * The 'start address' is the logical address of the *new* root node for root + * replace operations, or the logical address of the affected block for all + * other operations. 
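The two-level key described above (logical address first, sequence number second) behaves like an ordinary compound comparator. A small user-space illustration of that ordering, not the kernel rb-tree code:

#include <stdio.h>
#include <stdlib.h>

struct mod_entry {
	unsigned long long logical;
	unsigned long long seq;
};

static int cmp_mod_entry(const void *a, const void *b)
{
	const struct mod_entry *x = a, *y = b;

	/* primary key: block logical address */
	if (x->logical != y->logical)
		return x->logical < y->logical ? -1 : 1;
	/* secondary key: sequence number */
	if (x->seq != y->seq)
		return x->seq < y->seq ? -1 : 1;
	return 0;
}

int main(void)
{
	struct mod_entry log[] = {
		{ 4096, 7 }, { 4096, 3 }, { 8192, 1 }, { 4096, 9 },
	};

	qsort(log, 4, sizeof(log[0]), cmp_mod_entry);
	for (int i = 0; i < 4; i++)	/* (4096,3) (4096,7) (4096,9) (8192,1) */
		printf("logical=%llu seq=%llu\n", log[i].logical, log[i].seq);
	return 0;
}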
+ */ +static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, + struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = rb_entry(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->logical < tm->logical) + new = &((*new)->rb_left); + else if (cur->logical > tm->logical) + new = &((*new)->rb_right); + else if (cur->seq < tm->seq) + new = &((*new)->rb_left); + else if (cur->seq > tm->seq) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); + return 0; +} + +/* + * Determines if logging can be omitted. Returns true if it can. Otherwise, it + * returns false with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * write unlock fs_info::tree_mod_log_lock. + */ +static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return true; + if (eb && btrfs_header_level(eb) == 0) + return true; + + write_lock(&fs_info->tree_mod_log_lock); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + write_unlock(&fs_info->tree_mod_log_lock); + return true; + } + + return false; +} + +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return false; + if (eb && btrfs_header_level(eb) == 0) + return false; + + return true; +} + +static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, + int slot, + enum btrfs_mod_log_op op) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) + return NULL; + + tm->logical = eb->start; + if (op != BTRFS_MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op) +{ + struct tree_mod_elem *tm; + int ret = 0; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op); + if (!tm) + ret = -ENOMEM; + + if (tree_mod_dont_log(eb->fs_info, eb)) { + kfree(tm); + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + return 0; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. 
+ */ + goto out_unlock; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); +out_unlock: + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + kfree(tm); + + return ret; +} + +static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; + int i; + bool locked = false; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } + + tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); + if (IS_ERR(tm)) { + ret = PTR_ERR(tm); + tm = NULL; + goto lock; + } + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); + if (!tm_list[i]) { + ret = -ENOMEM; + goto lock; + } + } + +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; + goto free_tms; + } + locked = true; + + /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. 
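A concrete way to see which slots lose their old contents in such a move is a plain array version of the same memmove. This is only a toy user-space model with made-up slot numbers: in the example below the old values of slots 1 to 3 (11, 12, 13) are overwritten, and those are exactly the slots the loop below records as removals:

#include <stdio.h>
#include <string.h>

int main(void)
{
	int slots[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
	int dst_slot = 1, src_slot = 4, nr_items = 3;

	/* move 3 items towards the beginning of the buffer */
	memmove(&slots[dst_slot], &slots[src_slot], nr_items * sizeof(int));

	for (int i = 0; i < 8; i++)
		printf("%d ", slots[i]);	/* 10 14 15 16 14 15 16 17 */
	printf("\n");
	return 0;
}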
+ */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert(eb->fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + if (ret) + goto free_tms; + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + if (tm_list) { + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } + } + if (locked) + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + kfree(tm); + + return ret; +} + +static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) +{ + int i, j; + int ret; + + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } + } + + return 0; +} + +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal) +{ + struct btrfs_fs_info *fs_info = old_root->fs_info; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); + if (!tm_list[i]) { + ret = -ENOMEM; + goto lock; + } + } + } + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto lock; + } + + tm->logical = new_root->start; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; + +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; + goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. 
+ */ + goto out_unlock; + } + + if (tm_list) + ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = tree_mod_log_insert(fs_info, tm); + +out_unlock: + write_unlock(&fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; +} + +static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq, + bool smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = rb_entry(node, struct tree_mod_elem, node); + if (cur->logical < start) { + node = node->rb_left; + } else if (cur->logical > start) { + node = node->rb_right; + } else if (cur->seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* We want the node with the highest seq */ + if (found) + BUG_ON(found->seq > cur->seq); + found = cur; + node = node->rb_left; + } else if (cur->seq > min_seq) { + /* We want the node with the smallest seq */ + if (found) + BUG_ON(found->seq < cur->seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * This returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). Any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, true); +} + +/* + * This returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). Any element with + * a time sequence lower than min_seq will be ignored. 
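A rough user-space model of the two search modes (oldest versus newest entry of a given block that still has seq >= min_seq) only needs a linear scan; the kernel walks the rb-tree instead, but the selection rule is the same:

#include <stdio.h>

struct mod_entry {
	unsigned long long logical;
	unsigned long long seq;
};

static const struct mod_entry *toy_search(const struct mod_entry *log, int n,
					  unsigned long long start,
					  unsigned long long min_seq,
					  int smallest)
{
	const struct mod_entry *found = NULL;

	for (int i = 0; i < n; i++) {
		if (log[i].logical != start || log[i].seq < min_seq)
			continue;
		if (!found ||
		    (smallest ? log[i].seq < found->seq : log[i].seq > found->seq))
			found = &log[i];
	}
	return found;
}

int main(void)
{
	const struct mod_entry log[] = {
		{ 4096, 3 }, { 4096, 7 }, { 8192, 5 }, { 4096, 9 },
	};
	const struct mod_entry *e = toy_search(log, 4, 4096, 4, 0);

	if (e)
		printf("newest seq for block 4096: %llu\n", e->seq);	/* 9 */
	return 0;
}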
+ */ +static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, false); +} + +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items) +{ + struct btrfs_fs_info *fs_info = dst->fs_info; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add = NULL; + struct tree_mod_elem **tm_list_rem = NULL; + int i; + bool locked = false; + struct tree_mod_elem *dst_move_tm = NULL; + struct tree_mod_elem *src_move_tm = NULL; + u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset; + u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items); + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } + + if (dst_move_nr_items) { + dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items, + dst_offset, dst_move_nr_items); + if (IS_ERR(dst_move_tm)) { + ret = PTR_ERR(dst_move_tm); + dst_move_tm = NULL; + goto lock; + } + } + if (src_move_nr_items) { + src_move_tm = tree_mod_log_alloc_move(src, src_offset, + src_offset + nr_items, + src_move_nr_items); + if (IS_ERR(src_move_tm)) { + ret = PTR_ERR(src_move_tm); + src_move_tm = NULL; + goto lock; + } + } + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + BTRFS_MOD_LOG_KEY_REMOVE); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto lock; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + BTRFS_MOD_LOG_KEY_ADD); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto lock; + } + } + +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; + goto free_tms; + } + locked = true; + + /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. 
+ */ + if (ret != 0) + goto free_tms; + + if (dst_move_tm) { + ret = tree_mod_log_insert(fs_info, dst_move_tm); + if (ret) + goto free_tms; + } + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + if (src_move_tm) { + ret = tree_mod_log_insert(fs_info, src_move_tm); + if (ret) + goto free_tms; + } + + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node)) + rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log); + kfree(dst_move_tm); + if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) + rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); + kfree(src_move_tm); + if (tm_list) { + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + } + if (locked) + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return ret; +} + +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) +{ + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); + if (!tm_list[i]) { + ret = -ENOMEM; + goto lock; + } + } + +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; + goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; + } + + ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); +out_unlock: + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + + return ret; +} + +/* + * Returns the logical address of the oldest predecessor of the given root. + * Entries older than time_seq are ignored. + */ +static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, + u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = eb_root->start; + bool looped = false; + + if (!time_seq) + return NULL; + + /* + * The very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). This has the logical address + * of the *new* root, making it the very first operation that's logged + * for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, + time_seq); + if (!looped && !tm) + return NULL; + /* + * If there are no tree operation for the oldest root, we simply + * return it. This should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * If there's an operation that's not a root replacement, we + * found the oldest version of our root. Normally, we'll find a + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. 
+ */ + if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + looped = true; + } + + /* If there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + + +/* + * tm is a pointer to the first operation to rewind within eb. Then, all + * previous operations will be rewound (until we reach something older than + * time_seq). + */ +static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + /* + * max_slot tracks the maximum valid slot of the rewind eb at every + * step of the rewind. This is in contrast with 'n' which eventually + * matches the number of items, but can be wrong during moves or if + * removes overlap on already valid slots (which is probably separately + * a bug). We do this to validate the offsets of memmoves for rewinding + * moves and detect invalid memmoves. + * + * Since a rewind eb can start empty, max_slot is a signed integer with + * a special meaning for -1, which is that no slot is valid to move out + * of. Any other negative value is invalid. + */ + int max_slot; + int move_src_end_slot; + int move_dst_end_slot; + + n = btrfs_header_nritems(eb); + max_slot = n - 1; + read_lock(&fs_info->tree_mod_log_lock); + while (tm && tm->seq >= time_seq) { + ASSERT(max_slot >= -1); + /* + * All the operations are recorded with the operator used for + * the modification. As we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + fallthrough; + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case BTRFS_MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + if (tm->slot > max_slot) + max_slot = tm->slot; + break; + case BTRFS_MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case BTRFS_MOD_LOG_KEY_ADD: + /* + * It is possible we could have already removed keys + * behind the known max slot, so this will be an + * overestimate. In practice, the copy operation + * inserts them in increasing order, and overestimating + * just means we miss some warnings, so it's OK. It + * isn't worth carefully tracking the full array of + * valid slots to check against when moving. 
+ */ + if (tm->slot == max_slot) + max_slot--; + /* if a move operation is needed it's in the log */ + n--; + break; + case BTRFS_MOD_LOG_MOVE_KEYS: + ASSERT(tm->move.nr_items > 0); + move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1; + move_dst_end_slot = tm->slot + tm->move.nr_items - 1; + o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); + o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); + if (WARN_ON(move_src_end_slot > max_slot || + tm->move.nr_items <= 0)) { + btrfs_warn(fs_info, +"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d", + eb->start, tm->slot, + tm->move.dst_slot, tm->move.nr_items, + tm->seq, n, max_slot); + } + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + max_slot = move_dst_end_slot; + break; + case BTRFS_MOD_LOG_ROOT_REPLACE: + /* + * This operation is special. For roots, this must be + * handled explicitly before rewinding. + * For non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. In the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. We simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = rb_entry(next, struct tree_mod_elem, node); + if (tm->logical != first_tm->logical) + break; + } + read_unlock(&fs_info->tree_mod_log_lock); + btrfs_set_header_nritems(eb, n); +} + +/* + * Called with eb read locked. If the buffer cannot be rewound, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + } + + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); + btrfs_tree_read_lock(eb_rewin); + tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb_rewin; +} + +/* + * Rewind the state of @root's root node to the given @time_seq value. + * If there are no changes, the current root->root_node is returned. If anything + * changed in between, there's a fresh buffer allocated on which the rewind + * operations are done. In any case, the returned buffer is read locked. 
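The rewind idea used by this function and by tree_mod_log_rewind(), which is to walk the recorded operations from newest to oldest and apply the inverse of each one until time_seq is reached, can be sketched with a toy array and a made-up operation log (user-space illustration only, not the kernel structures):

#include <stdio.h>

enum toy_op { TOY_KEY_ADD, TOY_KEY_REMOVE };

struct toy_mod {
	unsigned long long seq;
	enum toy_op op;
	int slot;
	int old_val;	/* only meaningful for TOY_KEY_REMOVE */
};

int main(void)
{
	int items[8] = { 1, 2, 4 };
	int nritems = 3;
	/* newest first, as when walking the log backwards */
	const struct toy_mod log[] = {
		{ 12, TOY_KEY_ADD,    2, 0 },	/* value 4 was added at slot 2 */
		{ 11, TOY_KEY_REMOVE, 2, 3 },	/* value 3 was removed from slot 2 */
	};
	unsigned long long time_seq = 10;

	for (int i = 0; i < 2 && log[i].seq >= time_seq; i++) {
		switch (log[i].op) {
		case TOY_KEY_ADD:	/* inverse of an add: drop the item */
			nritems--;
			break;
		case TOY_KEY_REMOVE:	/* inverse of a remove: put it back */
			items[log[i].slot] = log[i].old_val;
			nritems++;
			break;
		}
	}

	for (int i = 0; i < nritems; i++)
		printf("%d ", items[i]);	/* 1 2 3 */
	printf("\n");
	return 0;
}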
+ * Returns NULL on error (with no locks held). + */ +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; + u64 eb_root_owner = 0; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + int level; + + eb_root = btrfs_read_lock_root_node(root); + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (!tm) + return eb_root; + + if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + level = old_root->level; + } else { + logical = eb_root->start; + level = btrfs_header_level(eb_root); + } + + tm = tree_mod_log_search(fs_info, logical, time_seq); + if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + struct btrfs_tree_parent_check check = { 0 }; + + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + + check.level = level; + check.owner_root = root->root_key.objectid; + + old = read_tree_block(fs_info, logical, &check); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); + btrfs_warn(fs_info, + "failed to read tree block %llu from get_old_root", + logical); + } else { + struct tree_mod_elem *tm2; + + btrfs_tree_read_lock(old); + eb = btrfs_clone_extent_buffer(old); + /* + * After the lookup for the most recent tree mod operation + * above and before we locked and cloned the extent buffer + * 'old', a new tree mod log operation may have been added. + * So lookup for a more recent one to make sure the number + * of mod log operations we replay is consistent with the + * number of items we have in the cloned extent buffer, + * otherwise we can hit a BUG_ON when rewinding the extent + * buffer. 
+ */ + tm2 = tree_mod_log_search(fs_info, logical, time_seq); + btrfs_tree_read_unlock(old); + free_extent_buffer(old); + ASSERT(tm2); + ASSERT(tm2 == tm || tm2->seq > tm->seq); + if (!tm2 || tm2->seq < tm->seq) { + free_extent_buffer(eb); + return NULL; + } + tm = tm2; + } + } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(fs_info, logical); + } else { + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + } + + if (!eb) + return NULL; + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, eb_root_owner); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); + if (tm) + tree_mod_log_rewind(fs_info, eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb; +} + +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + struct extent_buffer *eb_root = btrfs_root_node(root); + + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) + level = tm->old_root.level; + else + level = btrfs_header_level(eb_root); + + free_extent_buffer(eb_root); + + return level; +} + +/* + * Return the lowest sequence number in the tree modification log. + * + * Return the sequence number of the oldest tree modification log user, which + * corresponds to the lowest sequence number of all existing users. If there are + * no users it returns 0. + */ +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) +{ + u64 ret = 0; + + read_lock(&fs_info->tree_mod_log_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct btrfs_seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + ret = elem->seq; + } + read_unlock(&fs_info->tree_mod_log_lock); + + return ret; +} diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h new file mode 100644 index 0000000000..94f10afeee --- /dev/null +++ b/fs/btrfs/tree-mod-log.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_TREE_MOD_LOG_H +#define BTRFS_TREE_MOD_LOG_H + +#include "ctree.h" + +/* Represents a tree mod log user. 
*/ +struct btrfs_seq_list { + struct list_head list; + u64 seq; +}; + +#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define BTRFS_SEQ_LAST ((u64)-1) + +enum btrfs_mod_log_op { + BTRFS_MOD_LOG_KEY_REPLACE, + BTRFS_MOD_LOG_KEY_ADD, + BTRFS_MOD_LOG_KEY_REMOVE, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, + BTRFS_MOD_LOG_MOVE_KEYS, + BTRFS_MOD_LOG_ROOT_REPLACE, +}; + +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal); +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op); +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq); +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items); +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items); +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 0000000000..3360602551 --- /dev/null +++ b/fs/btrfs/ulist.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + */ + +#include +#include "messages.h" +#include "ulist.h" +#include "ctree.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * ULIST_ITER_INIT(&uiter); + * + * while ((elem = ulist_next(ulist, &uiter)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are addressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/* + * Freshly initialize a ulist. + * + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + INIT_LIST_HEAD(&ulist->nodes); + ulist->root = RB_ROOT; + ulist->nnodes = 0; +} + +/* + * Free up additionally allocated memory for the ulist. + * + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. 
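+ *
+ * A minimal sketch (not a caller in this file) of the on-stack pairing,
+ * where ulist_init()/ulist_release() replace ulist_alloc()/ulist_free():
+ *
+ *	struct ulist ul;
+ *
+ *	ulist_init(&ul);
+ *	ulist_add(&ul, bytenr, 0, GFP_NOFS);
+ *	... enumerate with ulist_next() ...
+ *	ulist_release(&ul);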
+ */ +void ulist_release(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } + ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); +} + +/* + * Prepare a ulist for reuse. + * + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_release(ulist); + ulist_init(ulist); +} + +/* + * Dynamically allocate a ulist. + * + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(gfp_t gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} + +/* + * Free dynamically allocated ulist. + * + * @ulist: ulist to free + * + * It is not necessary to call ulist_release before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_release(ulist); + kfree(ulist); +} + +static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) +{ + struct rb_node *n = ulist->root.rb_node; + struct ulist_node *u = NULL; + + while (n) { + u = rb_entry(n, struct ulist_node, rb_node); + if (u->val < val) + n = n->rb_right; + else if (u->val > val) + n = n->rb_left; + else + return u; + } + return NULL; +} + +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, &ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + +static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) +{ + struct rb_node **p = &ulist->root.rb_node; + struct rb_node *parent = NULL; + struct ulist_node *cur = NULL; + + while (*p) { + parent = *p; + cur = rb_entry(parent, struct ulist_node, rb_node); + + if (cur->val < ins->val) + p = &(*p)->rb_right; + else if (cur->val > ins->val) + p = &(*p)->rb_left; + else + return -EEXIST; + } + rb_link_node(&ins->rb_node, parent, p); + rb_insert_color(&ins->rb_node, &ulist->root); + return 0; +} + +/* + * Add an element to the ulist. + * + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. 
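+ *
+ * A short sketch of checking the result (ulist, bytenr and aux stand in
+ * for the caller's own values):
+ *
+ *	ret = ulist_add(ulist, bytenr, aux, GFP_NOFS);
+ *	if (ret < 0)
+ *		return ret;
+ *	if (ret == 1)
+ *		newly_added++;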
+ */ +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) +{ + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask) +{ + int ret; + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + if (node) { + if (old_aux) + *old_aux = node->aux; + return 0; + } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; + + node->val = val; + node->aux = aux; + + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; + + return 1; +} + +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux matches. + * Return 0 for successful delete. + * Return > 0 for not found. + */ +int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + +/* + * Iterate ulist. + * + * @ulist: ulist to iterate + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter) +{ + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) + return NULL; + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) + return NULL; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; + } + node = list_entry(uiter->cur_list, struct ulist_node, list); + return node; +} diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 0000000000..b2cef187ea --- /dev/null +++ b/fs/btrfs/ulist.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + */ + +#ifndef BTRFS_ULIST_H +#define BTRFS_ULIST_H + +#include +#include + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. 
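+ *
+ * A short, hedged sketch of the aux semantics as implemented by
+ * ulist_add_merge() (values purely illustrative):
+ *
+ *	u64 old;
+ *
+ *	ulist_add_merge(ulist, val, 1, &old, GFP_KERNEL);
+ *	ulist_add_merge(ulist, val, 2, &old, GFP_KERNEL);
+ *
+ * The first call inserts val with aux == 1 and returns 1; the second call
+ * returns 0, reports the stored aux back through @old (old == 1) and
+ * ignores the new aux value 2, since val already exists.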
+ * + */ +struct ulist_iterator { + struct list_head *cur_list; /* hint to start search */ +}; + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + u64 aux; /* auxiliary value saved along with the val */ + + struct list_head list; /* used to link node */ + struct rb_node rb_node; /* used to speed up search */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + struct list_head nodes; + struct rb_root root; +}; + +void ulist_init(struct ulist *ulist); +void ulist_release(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(gfp_t gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask); +int ulist_del(struct ulist *ulist, u64 val, u64 aux); + +/* just like ulist_add_merge() but take a pointer for the aux data */ +static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, + void **old_aux, gfp_t gfp_mask) +{ +#if BITS_PER_LONG == 32 + u64 old64 = (uintptr_t)*old_aux; + int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask); + *old_aux = (void *)((uintptr_t)old64); + return ret; +#else + return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask); +#endif +} + +struct ulist_node *ulist_next(const struct ulist *ulist, + struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) + +#endif diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c new file mode 100644 index 0000000000..5be74f9e47 --- /dev/null +++ b/fs/btrfs/uuid-tree.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) STRATO AG 2013. All rights reserved. 
+ */ + +#include +#include +#include "messages.h" +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" +#include "fs.h" +#include "accessors.h" +#include "uuid-tree.h" + +static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +{ + key->type = type; + key->objectid = get_unaligned_le64(uuid); + key->offset = get_unaligned_le64(uuid + sizeof(u64)); +} + +/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + u8 type, u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + u32 item_size; + unsigned long offset; + struct btrfs_key key; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -ENOENT; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(uuid_root->fs_info, + "uuid item with illegal size %lu!", + (unsigned long)item_size); + goto out; + } + while (item_size) { + __le64 data; + + read_extent_buffer(eb, &data, offset, sizeof(data)); + if (le64_to_cpu(data) == subid) { + ret = 0; + break; + } + offset += sizeof(data); + item_size -= sizeof(data); + } + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid_cpu) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *uuid_root = fs_info->uuid_root; + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + __le64 subid_le; + + ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); + if (ret != -ENOENT) + return ret; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, + sizeof(subid_le)); + if (ret >= 0) { + /* Add an item for the type for the first time */ + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + } else if (ret == -EEXIST) { + /* + * An item with that type already exists. + * Extend the item and store the new subid at the end. 
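+ *
+ * The item ends up as a packed array of little-endian u64 subids keyed
+ * by the split uuid, e.g. (layout sketch, subids illustrative) after
+ * adding subvolume ids 257 and 258 for the same uuid and type:
+ *
+ *	key (uuid[0..7] BTRFS_UUID_KEY_SUBVOL uuid[8..15]), item size 16
+ *		bytes  0..7   le64 257
+ *		bytes  8..15  le64 258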
+ */ + btrfs_extend_item(trans, path, sizeof(subid_le)); + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + offset += btrfs_item_size(eb, slot) - sizeof(subid_le); + } else { + btrfs_warn(fs_info, + "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", + ret, key.objectid, key.offset, type); + goto out; + } + + ret = 0; + subid_le = cpu_to_le64(subid_cpu); + write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); + btrfs_mark_buffer_dirty(trans, eb); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *uuid_root = fs_info->uuid_root; + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + u32 item_size; + unsigned long move_dst; + unsigned long move_src; + unsigned long move_len; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn(fs_info, "error %d while searching for uuid item!", + ret); + goto out; + } + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + ret = -ENOENT; + goto out; + } + while (item_size) { + __le64 read_subid; + + read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid)); + if (le64_to_cpu(read_subid) == subid) + break; + offset += sizeof(read_subid); + item_size -= sizeof(read_subid); + } + + if (!item_size) { + ret = -ENOENT; + goto out; + } + + item_size = btrfs_item_size(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; + } + + move_dst = offset; + move_src = offset + sizeof(subid); + move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); + memmove_extent_buffer(eb, move_dst, move_src, move_len); + btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* 1 - for the uuid item */ + trans = btrfs_start_transaction(uuid_root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); + btrfs_end_transaction(trans); + +out: + return ret; +} + +/* + * Check if there's an matching subvolume for given UUID + * + * Return: + * 0 check succeeded, the entry is not outdated + * > 0 if the check failed, the caller should remove the entry + * < 0 if an error occurred + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subvolid) +{ + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + subvol_root = btrfs_get_fs_root(fs_info, subvolid, true); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if 
(memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + btrfs_put_root(subvol_root); +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = fs_info->uuid_root; + struct btrfs_key key; + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + int slot; + u32 item_size; + unsigned long offset; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = 0; + key.offset = 0; + +again_search_slot: + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + + while (1) { + if (btrfs_fs_closing(fs_info)) { + ret = -EINTR; + goto out; + } + cond_resched(); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_UUID_KEY_SUBVOL && + key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, + "uuid item with illegal size %lu!", + (unsigned long)item_size); + goto skip; + } + while (item_size) { + u8 uuid[BTRFS_UUID_SIZE]; + __le64 subid_le; + u64 subid_cpu; + + put_unaligned_le64(key.objectid, uuid); + put_unaligned_le64(key.offset, uuid + sizeof(u64)); + read_extent_buffer(leaf, &subid_le, offset, + sizeof(subid_le)); + subid_cpu = le64_to_cpu(subid_le); + ret = btrfs_check_uuid_tree_entry(fs_info, uuid, + key.type, subid_cpu); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_uuid_iter_rem(root, uuid, key.type, + subid_cpu); + if (ret == 0) { + /* + * this might look inefficient, but the + * justification is that it is an + * exception that check_func returns 1, + * and that in the regular case only one + * entry per UUID exists. 
+ */ + goto again_search_slot; + } + if (ret < 0 && ret != -ENOENT) + goto out; + key.offset++; + goto again_search_slot; + } + item_size -= sizeof(subid_le); + offset += sizeof(subid_le); + } + +skip: + ret = btrfs_next_item(root, path); + if (ret == 0) + continue; + else if (ret > 0) + ret = 0; + break; + } + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h new file mode 100644 index 0000000000..5350c87fe2 --- /dev/null +++ b/fs/btrfs/uuid-tree.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_UUID_TREE_H +#define BTRFS_UUID_TREE_H + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c new file mode 100644 index 0000000000..744f4f4d4c --- /dev/null +++ b/fs/btrfs/verity.c @@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "messages.h" +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "fs.h" +#include "accessors.h" +#include "ioctl.h" +#include "verity.h" +#include "orphan.h" + +/* + * Implementation of the interface defined in struct fsverity_operations. + * + * The main question is how and where to store the verity descriptor and the + * Merkle tree. We store both in dedicated btree items in the filesystem tree, + * together with the rest of the inode metadata. This means we'll need to do + * extra work to encrypt them once encryption is supported in btrfs, but btrfs + * has a lot of careful code around i_size and it seems better to make a new key + * type than try and adjust all of our expectations for i_size. + * + * Note that this differs from the implementation in ext4 and f2fs, where + * this data is stored as if it were in the file, but past EOF. However, btrfs + * does not have a widespread mechanism for caching opaque metadata pages, so we + * do pretend that the Merkle tree pages themselves are past EOF for the + * purposes of caching them (as opposed to creating a virtual inode). + * + * fs verity items are stored under two different key types on disk. + * The descriptor items: + * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ] + * + * At offset 0, we store a btrfs_verity_descriptor_item which tracks the + * size of the descriptor item and some extra data for encryption. + * Starting at offset 1, these hold the generic fs verity descriptor. + * The latter are opaque to btrfs, we just read and write them as a blob for + * the higher level verity code. The most common descriptor size is 256 bytes. + * + * The merkle tree items: + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ] + * + * These also start at offset 0, and correspond to the merkle tree bytes. + * So when fsverity asks for page 0 of the merkle tree, we pull up one page + * starting at offset 0 for this key type. These are also opaque to btrfs, + * we're blindly storing whatever fsverity sends down. + * + * Another important consideration is the fact that the Merkle tree data scales + * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's + * ~1/127th the size) so for large files, writing the tree can be a lengthy + * operation. 
For that reason, we guard the whole enable verity operation + * (between begin_enable_verity and end_enable_verity) with an orphan item. + * Again, because the data can be pretty large, it's quite possible that we + * could run out of space writing it, so we try our best to handle errors by + * stopping and rolling back rather than aborting the victim transaction. + */ + +#define MERKLE_START_ALIGN 65536 + +/* + * Compute the logical file offset where we cache the Merkle tree. + * + * @inode: inode of the verity file + * + * For the purposes of caching the Merkle tree pages, as required by + * fs-verity, it is convenient to do size computations in terms of a file + * offset, rather than in terms of page indices. + * + * Use 64K to be sure it's past the last page in the file, even with 64K pages. + * That rounding operation itself can overflow loff_t, so we do it in u64 and + * check. + * + * Returns the file offset on success, negative error code on failure. + */ +static loff_t merkle_file_pos(const struct inode *inode) +{ + u64 sz = inode->i_size; + u64 rounded = round_up(sz, MERKLE_START_ALIGN); + + if (rounded > inode->i_sb->s_maxbytes) + return -EFBIG; + + return rounded; +} + +/* + * Drop all the items for this inode with this key_type. + * + * @inode: inode to drop items for + * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or + * BTRFS_VERITY_MERKLE_ITEM) + * + * Before doing a verity enable we cleanup any existing verity items. + * This is also used to clean up if a verity enable failed half way through. + * + * Returns number of dropped items on success, negative error code on failure. + */ +static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + int count = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + /* 1 for the item being dropped */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* + * Walk backwards through all the items until we find one that + * isn't from our key type or objectid + */ + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = 0; + /* No more keys of this type, we're done */ + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + btrfs_end_transaction(trans); + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + /* No more keys of this type, we're done */ + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + /* + * This shouldn't be a performance sensitive function because + * it's not used as part of truncate. If it ever becomes + * perf sensitive, change this to walk forward and bulk delete + * items + */ + ret = btrfs_del_items(trans, root, path, path->slots[0], 1); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + count++; + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + ret = count; + btrfs_end_transaction(trans); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Drop all verity items + * + * @inode: inode to drop verity items for + * + * In most contexts where we are dropping verity items, we want to do it for all + * the types of verity items, not a particular one. + * + * Returns: 0 on success, negative error code on failure. 
+ */ +int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + int ret; + + ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY); + if (ret < 0) + return ret; + ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Insert and write inode items with a given key type and offset. + * + * @inode: inode to insert for + * @key_type: key type to insert + * @offset: item offset to insert at + * @src: source data to write + * @len: length of source data to write + * + * Write len bytes from src into items of up to 2K length. + * The inserted items will have key (ino, key_type, offset + off) where off is + * consecutively increasing from 0 up to the last item ending at offset + len. + * + * Returns 0 on success and a negative error code on failure. + */ +static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + const char *src, u64 len) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long copy_bytes; + unsigned long src_offset = 0; + void *data; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (len > 0) { + /* 1 for the new item being inserted */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + /* + * Insert 2K at a time mostly to be friendly for smaller leaf + * size filesystems + */ + copy_bytes = min_t(u64, len, 2048); + + ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes); + if (ret) { + btrfs_end_transaction(trans); + break; + } + + leaf = path->nodes[0]; + + data = btrfs_item_ptr(leaf, path->slots[0], void); + write_extent_buffer(leaf, src + src_offset, + (unsigned long)data, copy_bytes); + offset += copy_bytes; + src_offset += copy_bytes; + len -= copy_bytes; + + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + + btrfs_free_path(path); + return ret; +} + +/* + * Read inode items of the given key type and offset from the btree. + * + * @inode: inode to read items of + * @key_type: key type to read + * @offset: item offset to read from + * @dest: Buffer to read into. This parameter has slightly tricky + * semantics. If it is NULL, the function will not do any copying + * and will just return the size of all the items up to len bytes. + * If dest_page is passed, then the function will kmap_local the + * page and ignore dest, but it must still be non-NULL to avoid the + * counting-only behavior. + * @len: length in bytes to read + * @dest_page: copy into this page instead of the dest buffer + * + * Helper function to read items from the btree. This returns the number of + * bytes read or < 0 for errors. We can return short reads if the items don't + * exist on disk or aren't big enough to fill the desired length. Supports + * reading into a provided buffer (dest) or into the page cache + * + * Returns number of bytes read or a negative error code on failure. 
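+ *
+ * The call shapes used later in this file boil down to (sketch, with KEY
+ * and off standing in for the real arguments):
+ *
+ *	read_key_bytes(inode, KEY, 0, (char *)&item, sizeof(item), NULL)
+ *		small fixed-size read into a stack buffer
+ *	read_key_bytes(inode, KEY, off, folio_address(folio), PAGE_SIZE,
+ *		       &folio->page)
+ *		read one Merkle tree page into the page cache folio
+ *
+ * Passing dest == NULL (and no dest_page) skips the copying entirely and
+ * only sums the item sizes, up to @len bytes.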
+ */ +static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + char *dest, u64 len, struct page *dest_page) +{ + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 item_end; + u64 copy_end; + int copied = 0; + u32 copy_offset; + unsigned long copy_bytes; + unsigned long dest_offset = 0; + void *data; + char *kaddr = dest; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (dest_page) + path->reada = READA_FORWARD; + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (len > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset; + + if (copied > 0) { + /* + * Once we've copied something, we want all of the items + * to be sequential + */ + if (key.offset != offset) + break; + } else { + /* + * Our initial offset might be in the middle of an + * item. Make sure it all makes sense. + */ + if (key.offset > offset) + break; + if (item_end <= offset) + break; + } + + /* desc = NULL to just sum all the item lengths */ + if (!dest) + copy_end = item_end; + else + copy_end = min(offset + len, item_end); + + /* Number of bytes in this item we want to copy */ + copy_bytes = copy_end - offset; + + /* Offset from the start of item for copying */ + copy_offset = offset - key.offset; + + if (dest) { + if (dest_page) + kaddr = kmap_local_page(dest_page); + + data = btrfs_item_ptr(leaf, path->slots[0], void); + read_extent_buffer(leaf, kaddr + dest_offset, + (unsigned long)data + copy_offset, + copy_bytes); + + if (dest_page) + kunmap_local(kaddr); + } + + offset += copy_bytes; + dest_offset += copy_bytes; + len -= copy_bytes; + copied += copy_bytes; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + /* + * We've reached the last slot in this leaf and we need + * to go to the next leaf. + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + } + } +out: + btrfs_free_path(path); + if (!ret) + ret = copied; + return ret; +} + +/* + * Delete an fsverity orphan + * + * @trans: transaction to do the delete in + * @inode: inode to orphan + * + * Capture verity orphan specific logic that is repeated in the couple places + * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes + * with 0 links. + * + * Returns zero on success or a negative error code on failure. + */ +static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + int ret; + + /* + * If the inode has no links, it is either already unlinked, or was + * created with O_TMPFILE. In either case, it should have an orphan from + * that other operation. Rather than reference count the orphans, we + * simply ignore them here, because we only invoke the verity path in + * the orphan logic when i_nlink is 1. + */ + if (!inode->vfs_inode.i_nlink) + return 0; + + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + if (ret == -ENOENT) + ret = 0; + return ret; +} + +/* + * Rollback in-progress verity if we encounter an error. 
+ * + * @inode: inode verity had an error for + * + * We try to handle recoverable errors while enabling verity by rolling it back + * and just failing the operation, rather than having an fs level error no + * matter what. However, any error in rollback is unrecoverable. + * + * Returns 0 on success, negative error code on failure. + */ +static int rollback_verity(struct btrfs_inode *inode) +{ + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *root = inode->root; + int ret; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + ret = btrfs_drop_verity_items(inode); + if (ret) { + btrfs_handle_fs_error(root->fs_info, ret, + "failed to drop verity items in rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + + /* + * 1 for updating the inode flag + * 1 for deleting the orphan + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + btrfs_handle_fs_error(root->fs_info, ret, + "failed to start transaction in verity rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + ret = del_orphan(trans, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } +out: + if (trans) + btrfs_end_transaction(trans); + return ret; +} + +/* + * Finalize making the file a valid verity file + * + * @inode: inode to be marked as verity + * @desc: contents of the verity descriptor to write (not NULL) + * @desc_size: size of the verity descriptor + * + * Do the actual work of finalizing verity after successfully writing the Merkle + * tree: + * + * - write out the descriptor items + * - mark the inode with the verity flag + * - delete the orphan item + * - mark the ro compat bit + * - clear the in progress bit + * + * Returns 0 on success, negative error code on failure. + */ +static int finish_verity(struct btrfs_inode *inode, const void *desc, + size_t desc_size) +{ + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *root = inode->root; + struct btrfs_verity_descriptor_item item; + int ret; + + /* Write out the descriptor item */ + memset(&item, 0, sizeof(item)); + btrfs_set_stack_verity_descriptor_size(&item, desc_size); + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, + (const char *)&item, sizeof(item)); + if (ret) + goto out; + + /* Write out the descriptor itself */ + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, + desc, desc_size); + if (ret) + goto out; + + /* + * 1 for updating the inode flag + * 1 for deleting the orphan + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + inode->ro_flags |= BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto end_trans; + ret = del_orphan(trans, inode); + if (ret) + goto end_trans; + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_set_fs_compat_ro(root->fs_info, VERITY); +end_trans: + btrfs_end_transaction(trans); +out: + return ret; + +} + +/* + * fsverity op that begins enabling verity. + * + * @filp: file to enable verity on + * + * Begin enabling fsverity for the file. 
We drop any existing verity items, add + * an orphan and set the in progress bit. + * + * Returns 0 on success, negative error code on failure. + */ +static int btrfs_begin_enable_verity(struct file *filp) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + int ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) + return -EBUSY; + + /* + * This should almost never do anything, but theoretically, it's + * possible that we failed to enable verity on a file, then were + * interrupted or failed while rolling back, failed to cleanup the + * orphan, and finally attempt to enable verity again. + */ + ret = btrfs_drop_verity_items(inode); + if (ret) + return ret; + + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_orphan_add(trans, inode); + if (!ret) + set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_end_transaction(trans); + + return 0; +} + +/* + * fsverity op that ends enabling verity. + * + * @filp: file we are finishing enabling verity on + * @desc: verity descriptor to write out (NULL in error conditions) + * @desc_size: size of the verity descriptor (variable with signatures) + * @merkle_tree_size: size of the merkle tree in bytes + * + * If desc is null, then VFS is signaling an error occurred during verity + * enable, and we should try to rollback. Otherwise, attempt to finish verity. + * + * Returns 0 on success, negative error code on error. + */ +static int btrfs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + int ret = 0; + int rollback_ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (desc == NULL) + goto rollback; + + ret = finish_verity(inode, desc, desc_size); + if (ret) + goto rollback; + return ret; + +rollback: + rollback_ret = rollback_verity(inode); + if (rollback_ret) + btrfs_err(inode->root->fs_info, + "failed to rollback verity items: %d", rollback_ret); + return ret; +} + +/* + * fsverity op that gets the struct fsverity_descriptor. + * + * @inode: inode to get the descriptor of + * @buf: output buffer for the descriptor contents + * @buf_size: size of the output buffer. 0 to query the size + * + * fsverity does a two pass setup for reading the descriptor, in the first pass + * it calls with buf_size = 0 to query the size of the descriptor, and then in + * the second pass it actually reads the descriptor off disk. + * + * Returns the size on success or a negative error code on failure. 
+ */ +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) +{ + u64 true_size; + int ret = 0; + struct btrfs_verity_descriptor_item item; + + memset(&item, 0, sizeof(item)); + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0, + (char *)&item, sizeof(item), NULL); + if (ret < 0) + return ret; + + if (item.reserved[0] != 0 || item.reserved[1] != 0) + return -EUCLEAN; + + true_size = btrfs_stack_verity_descriptor_size(&item); + if (true_size > INT_MAX) + return -EUCLEAN; + + if (buf_size == 0) + return true_size; + if (buf_size < true_size) + return -ERANGE; + + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1, + buf, buf_size, NULL); + if (ret < 0) + return ret; + if (ret != true_size) + return -EIO; + + return true_size; +} + +/* + * fsverity op that reads and caches a merkle tree page. + * + * @inode: inode to read a merkle tree page for + * @index: page index relative to the start of the merkle tree + * @num_ra_pages: number of pages to readahead. Optional, we ignore it + * + * The Merkle tree is stored in the filesystem btree, but its pages are cached + * with a logical position past EOF in the inode's mapping. + * + * Returns the page we read, or an ERR_PTR on error. + */ +static struct page *btrfs_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct folio *folio; + u64 off = (u64)index << PAGE_SHIFT; + loff_t merkle_pos = merkle_file_pos(inode); + int ret; + + if (merkle_pos < 0) + return ERR_PTR(merkle_pos); + if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE) + return ERR_PTR(-EFBIG); + index += merkle_pos >> PAGE_SHIFT; +again: + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (!IS_ERR(folio)) { + if (folio_test_uptodate(folio)) + goto out; + + folio_lock(folio); + /* If it's not uptodate after we have the lock, we got a read error. */ + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-EIO); + } + folio_unlock(folio); + goto out; + } + + folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), + 0); + if (!folio) + return ERR_PTR(-ENOMEM); + + ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS); + if (ret) { + folio_put(folio); + /* Did someone else insert a folio here? */ + if (ret == -EEXIST) + goto again; + return ERR_PTR(ret); + } + + /* + * Merkle item keys are indexed from byte 0 in the merkle tree. + * They have the form: + * + * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] + */ + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, + folio_address(folio), PAGE_SIZE, &folio->page); + if (ret < 0) { + folio_put(folio); + return ERR_PTR(ret); + } + if (ret < PAGE_SIZE) + folio_zero_segment(folio, ret, PAGE_SIZE); + + folio_mark_uptodate(folio); + folio_unlock(folio); + +out: + return folio_file_page(folio, index); +} + +/* + * fsverity op that writes a Merkle tree block into the btree. 
+ * + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree block to write + * @pos: the position of the block in the Merkle tree (in bytes) + * @size: the Merkle tree block size (in bytes) + * + * Returns 0 on success or negative error code on failure + */ +static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 pos, unsigned int size) +{ + loff_t merkle_pos = merkle_file_pos(inode); + + if (merkle_pos < 0) + return merkle_pos; + if (merkle_pos > inode->i_sb->s_maxbytes - pos - size) + return -EFBIG; + + return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, + pos, buf, size); +} + +const struct fsverity_operations btrfs_verityops = { + .begin_enable_verity = btrfs_begin_enable_verity, + .end_enable_verity = btrfs_end_enable_verity, + .get_verity_descriptor = btrfs_get_verity_descriptor, + .read_merkle_tree_page = btrfs_read_merkle_tree_page, + .write_merkle_tree_block = btrfs_write_merkle_tree_block, +}; diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h new file mode 100644 index 0000000000..91c10f7d0a --- /dev/null +++ b/fs/btrfs/verity.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_VERITY_H +#define BTRFS_VERITY_H + +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; + +int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + +#endif + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 0000000000..722a1dde75 --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,8104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "misc.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "sysfs.h" +#include "tree-checker.h" +#include "space-info.h" +#include "block-group.h" +#include "discard.h" +#include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "relocation.h" +#include "scrub.h" +#include "super.h" + +#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10 | \ + BTRFS_BLOCK_GROUP_RAID56_MASK) + +const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + .nparity = 0, + .raid_name = "raid10", + .bg_flag = BTRFS_BLOCK_GROUP_RAID10, + .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + .nparity = 0, + .raid_name = "raid1", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1, + .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1C3] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 3, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 3, + .ncopies = 3, + .nparity = 0, + .raid_name = "raid1c3", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, + .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1C4] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 4, + .devs_min = 4, + .tolerated_failures = 3, + .devs_increment = 4, + .ncopies = 4, + .nparity = 0, + .raid_name = "raid1c4", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, + .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 2, + .nparity = 0, + .raid_name = "dup", + .bg_flag = BTRFS_BLOCK_GROUP_DUP, + .mindev_error = 0, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + .nparity = 0, + .raid_name = "raid0", + .bg_flag = BTRFS_BLOCK_GROUP_RAID0, + .mindev_error = 0, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + .nparity = 0, + .raid_name = "single", + .bg_flag = 0, + .mindev_error = 0, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 1, + .ncopies = 1, + .nparity = 1, + .raid_name = "raid5", + .bg_flag = BTRFS_BLOCK_GROUP_RAID5, + .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 1, + .ncopies = 1, + .nparity = 2, + .raid_name = "raid6", + .bg_flag = BTRFS_BLOCK_GROUP_RAID6, + .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, + }, +}; + +/* + * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which + 
* can be used as index to access btrfs_raid_array[]. + */ +enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) +{ + const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); + + if (!profile) + return BTRFS_RAID_SINGLE; + + return BTRFS_BG_FLAG_TO_INDEX(profile); +} + +const char *btrfs_bg_type_to_raid_name(u64 flags) +{ + const int index = btrfs_bg_flags_to_raid_index(flags); + + if (index >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_array[index].raid_name; +} + +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + +/* + * Fill @buf with textual description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. + */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(flag, desc) \ + do { \ + if (flags & (flag)) { \ + ret = snprintf(bp, size_bp, "%s|", (desc)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(flag); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + + /* + * The text is trimmed, it's up to the caller to provide sufficiently + * large buffer + */ +out_overflow:; +} + +static int init_first_rw_device(struct btrfs_trans_handle *trans); +static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); +static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); + +/* + * Device locking + * ============== + * + * There are several mutexes that protect manipulation of devices and low-level + * structures like chunks but not block groups, extents or files + * + * uuid_mutex (global lock) + * ------------------------ + * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from + * the SCAN_DEV ioctl registration or from mount either implicitly (the first + * device) or requested by the device= mount option + * + * the mutex can be very coarse and can cover long-running operations + * + * protects: updates to fs_devices counters like missing devices, rw devices, + * seeding, structure cloning, opening/closing devices at mount/umount time + * + * global::fs_devs - add, remove, updates to the global list + * + * does not protect: manipulation of the fs_devices::devices list in general + * but in mount context it could be used to exclude list modifications by eg. + * scan ioctl + * + * btrfs_device::name - renames (write side), read is RCU + * + * fs_devices::device_list_mutex (per-fs, with RCU) + * ------------------------------------------------ + * protects updates to fs_devices::devices, ie. 
adding and deleting + * + * simple list traversal with read-only actions can be done with RCU protection + * + * may be used to exclude some operations from running concurrently without any + * modifications to the list (see write_all_supers) + * + * Is not required at mount and close times, because our device list is + * protected by the uuid_mutex at that point. + * + * balance_mutex + * ------------- + * protects balance structures (status, state) and context accessed from + * several places (internally, ioctl) + * + * chunk_mutex + * ----------- + * protects chunks, adding or removing during allocation, trim or when a new + * device is added/removed. Additionally it also protects post_commit_list of + * individual devices, since they can be added to the transaction's + * post_commit_list only with chunk_mutex held. + * + * cleaner_mutex + * ------------- + * a big lock that is held by the cleaner thread and prevents running subvolume + * cleaning together with relocation or delayed iputs + * + * + * Lock nesting + * ============ + * + * uuid_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex + * + * + * Exclusive operations + * ==================== + * + * Maintains the exclusivity of the following operations that apply to the + * whole filesystem and cannot run in parallel. + * + * - Balance (*) + * - Device add + * - Device remove + * - Device replace (*) + * - Resize + * + * The device operations (as above) can be in one of the following states: + * + * - Running state + * - Paused state + * - Completed state + * + * Only device operations marked with (*) can go into the Paused state for the + * following reasons: + * + * - ioctl (only Balance can be Paused through ioctl) + * - filesystem remounted as read-only + * - filesystem unmounted and mounted as read-only + * - system power-cycle and filesystem mounted as read-only + * - filesystem or device errors leading to forced read-only + * + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. + * A device operation in Paused or Running state can be canceled or resumed + * either by ioctl (Balance only) or when remounted as read-write. + * The exclusive status is cleared when the device operation is canceled or + * completed. + */ + +DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} + +/* + * alloc_fs_devices - allocate struct btrfs_fs_devices + * @fsid: if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid + * + * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). + * The returned struct is not linked onto any lists and can be destroyed with + * kfree() right away. 
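+ *
+ * Per the ASSERT below, the accepted argument combinations are (sketch):
+ * alloc_fs_devices(fsid, NULL) copies @fsid into both fsid and
+ * metadata_uuid, alloc_fs_devices(fsid, metadata_fsid) keeps the two
+ * apart, and alloc_fs_devices(NULL, NULL) leaves both zeroed for the
+ * caller to fill in later.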
+ */ +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, + const u8 *metadata_fsid) +{ + struct btrfs_fs_devices *fs_devs; + + ASSERT(fsid || !metadata_fsid); + + fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); + if (!fs_devs) + return ERR_PTR(-ENOMEM); + + mutex_init(&fs_devs->device_list_mutex); + + INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->alloc_list); + INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); + + if (fsid) { + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, + metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + } + + return fs_devs; +} + +static void btrfs_free_device(struct btrfs_device *device) +{ + WARN_ON(!list_empty(&device->post_commit_list)); + rcu_string_free(device->name); + extent_io_tree_release(&device->alloc_state); + btrfs_destroy_dev_zone_info(device); + kfree(device); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + btrfs_free_device(device); + } + kfree(fs_devices); +} + +void __exit btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, fs_list); + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } +} + +static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid, const u8 *metadata_fsid) +{ + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) + return false; + + if (!metadata_fsid) + return true; + + if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) + return false; + + return true; +} + +static noinline struct btrfs_fs_devices *find_fsid( + const u8 *fsid, const u8 *metadata_fsid) +{ + struct btrfs_fs_devices *fs_devices; + + ASSERT(fsid); + + /* Handle non-split brain cases */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) + return fs_devices; + } + return NULL; +} + +/* + * First check if the metadata_uuid is different from the fsid in the given + * fs_devices. Then check if the given fsid is the same as the metadata_uuid + * in the fs_devices. If it is, return true; otherwise, return false. + */ +static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; +} + +static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( + struct btrfs_super_block *disk_super) +{ + + struct btrfs_fs_devices *fs_devices; + + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (!fs_devices->fsid_change) + continue; + + if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, + fs_devices->fsid)) + return fs_devices; + } + + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. 
+ */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) + return fs_devices; + } + + return find_fsid(disk_super->fsid, disk_super->metadata_uuid); +} + + +static int +btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, + int flush, struct block_device **bdev, + struct btrfs_super_block **disk_super) +{ + int ret; + + *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); + + if (IS_ERR(*bdev)) { + ret = PTR_ERR(*bdev); + goto error; + } + + if (flush) + sync_blockdev(*bdev); + ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); + if (ret) { + blkdev_put(*bdev, holder); + goto error; + } + invalidate_bdev(*bdev); + *disk_super = btrfs_read_dev_super(*bdev); + if (IS_ERR(*disk_super)) { + ret = PTR_ERR(*disk_super); + blkdev_put(*bdev, holder); + goto error; + } + + return 0; + +error: + *bdev = NULL; + return ret; +} + +/* + * Search and remove all stale devices (which are not mounted). When both + * inputs are NULL, it will search and release all stale devices. + * + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. + * @skip_device: Optional. Will skip this device when searching for the stale + * devices. + * + * Return: 0 for success or if @devt is 0. + * -EBUSY if @devt is a mounted device. + * -ENOENT if @devt does not match any device in the list. + */ +static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) +{ + struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; + struct btrfs_device *device, *tmp_device; + int ret = 0; + + lockdep_assert_held(&uuid_mutex); + + if (devt) + ret = -ENOENT; + + list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry_safe(device, tmp_device, + &fs_devices->devices, dev_list) { + if (skip_device && skip_device == device) + continue; + if (devt && devt != device->devt) + continue; + if (fs_devices->opened) { + /* for an already deleted device return 0 */ + if (devt && ret != 0) + ret = -EBUSY; + break; + } + + /* delete the stale device */ + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + + ret = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + if (fs_devices->num_devices == 0) { + btrfs_sysfs_remove_fsid(fs_devices); + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + return ret; +} + +/* + * This is only used on mount, and we are protected from competing things + * messing with our fs_devices by the uuid_mutex, thus we do not need the + * fs_devices->device_list_mutex here. 
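A usage sketch of the btrfs_get_bdev_and_sb() helper above (hypothetical read-only probe; the later btrfs_get_dev_args_from_path() follows the same pattern):

static int example_read_devid(const char *path, u64 *devid)
{
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	int ret;

	/* Read-only, no holder, no flush needed for a probe. */
	ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	*devid = btrfs_stack_device_id(&disk_super->dev_item);

	/* The caller owns both the super block copy and the bdev reference. */
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, NULL);
	return 0;
}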
+ */ +static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device, blk_mode_t flags, + void *holder) +{ + struct block_device *bdev; + struct btrfs_super_block *disk_super; + u64 devid; + int ret; + + if (device->bdev) + return -EINVAL; + if (!device->name) + return -EINVAL; + + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &disk_super); + if (ret) + return ret; + + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_free_page; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) + goto error_free_page; + + device->generation = btrfs_super_generation(disk_super); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + if (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { + pr_err( + "BTRFS: Invalid seeding and uuid-changed device detected\n"); + goto error_free_page; + } + + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + fs_devices->seeding = true; + } else { + if (bdev_read_only(bdev)) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + else + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + } + + if (!bdev_nonrot(bdev)) + fs_devices->rotating = true; + + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + + device->bdev = bdev; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + device->holder = holder; + + fs_devices->open_devices++; + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + fs_devices->rw_devices++; + list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); + } + btrfs_release_disk_super(disk_super); + + return 0; + +error_free_page: + btrfs_release_disk_super(disk_super); + blkdev_put(bdev, holder); + + return -EINVAL; +} + +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) +{ + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; +} + +/* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. Such + * disk can belong to an fs which has its FSID changed or to one which doesn't. + * Handle both cases here. + */ +static struct btrfs_fs_devices *find_fsid_inprogress( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) + return fs_devices; + } + + return find_fsid(disk_super->fsid, NULL); +} + +static struct btrfs_fs_devices *find_fsid_changed( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handles the case where scanned device is part of an fs that had + * multiple successful changes of FSID but currently device didn't + * observe it. Meaning our fsid will be different than theirs. We need + * to handle two subcases : + * 1 - The fs still continues to have different METADATA/FSID uuids. + * 2 - The fs is switched back to its original FSID (METADATA/FSID + * are equal). 
+ */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + /* Changed UUIDs */ + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && + memcmp(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE) != 0) + return fs_devices; + + /* Unchanged UUIDs */ + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + + return NULL; +} + +static struct btrfs_fs_devices *find_fsid_reverted_metadata( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handle the case where the scanned device is part of an fs whose last + * metadata UUID change reverted it to the original FSID. At the same + * time fs_devices was first created by another constituent device + * which didn't fully observe the operation. This results in an + * btrfs_fs_devices created with metadata/fsid different AND + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the + * fs_devices equal to the FSID of the disk. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) + return fs_devices; + } + + return NULL; +} +/* + * Add new device to list of registered devices + * + * Returns: + * device pointer which was just added or updated when successful + * error pointer when failed + */ +static noinline struct btrfs_device *device_list_add(const char *path, + struct btrfs_super_block *disk_super, + bool *new_device_added) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = NULL; + struct rcu_string *name; + u64 found_transid = btrfs_super_generation(disk_super); + u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_t path_devt; + int error; + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + error = lookup_bdev(path, &path_devt); + if (error) { + btrfs_err(NULL, "failed to lookup block device for path %s: %d", + path, error); + return ERR_PTR(error); + } + + if (fsid_change_in_progress) { + if (!has_metadata_uuid) + fs_devices = find_fsid_inprogress(disk_super); + else + fs_devices = find_fsid_changed(disk_super); + } else if (has_metadata_uuid) { + fs_devices = find_fsid_with_metadata_uuid(disk_super); + } else { + fs_devices = find_fsid_reverted_metadata(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); + } + + + if (!fs_devices) { + fs_devices = alloc_fs_devices(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); + if (IS_ERR(fs_devices)) + return ERR_CAST(fs_devices); + + fs_devices->fsid_change = fsid_change_in_progress; + + mutex_lock(&fs_devices->device_list_mutex); + list_add(&fs_devices->fs_list, &fs_uuids); + + device = NULL; + } else { + struct btrfs_dev_lookup_args args = { + .devid = devid, + .uuid = disk_super->dev_item.uuid, + }; + + mutex_lock(&fs_devices->device_list_mutex); + device = btrfs_find_device(fs_devices, &args); + + /* + * If this disk has been pulled into an fs devices created by + * a device which had the CHANGING_FSID_V2 flag then replace the + * metadata_uuid/fsid values of the fs_devices. 
+ */ + if (fs_devices->fsid_change && + found_transid > fs_devices->latest_generation) { + memcpy(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, + btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); + fs_devices->fsid_change = false; + } + } + + if (!device) { + unsigned int nofs_flag; + + if (fs_devices->opened) { + btrfs_err(NULL, +"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, fs_devices->fsid, current->comm, + task_pid_nr(current)); + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_PTR(-EBUSY); + } + + nofs_flag = memalloc_nofs_save(); + device = btrfs_alloc_device(NULL, &devid, + disk_super->dev_item.uuid, path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(device)) { + mutex_unlock(&fs_devices->device_list_mutex); + /* we can safely leave the fs_devices entry around */ + return device; + } + + device->devt = path_devt; + + list_add_rcu(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + + device->fs_devices = fs_devices; + *new_device_added = true; + + if (disk_super->label[0]) + pr_info( + "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->label, devid, found_transid, path, + current->comm, task_pid_nr(current)); + else + pr_info( + "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->fsid, devid, found_transid, path, + current->comm, task_pid_nr(current)); + + } else if (!device->name || strcmp(device->name->str, path)) { + /* + * When FS is already mounted. + * 1. If you are here and if the device->name is NULL that + * means this device was missing at time of FS mount. + * 2. If you are here and if the device->name is different + * from 'path' that means either + * a. The same device disappeared and reappeared with + * different name. or + * b. The missing-disk-which-was-replaced, has + * reappeared now. + * + * We must allow 1 and 2a above. But 2b would be a spurious + * and unintentional. + * + * Further in case of 1 and 2a above, the disk at 'path' + * would have missed some transaction when it was away and + * in case of 2a the stale bdev has to be updated as well. + * 2b must not be allowed at all time. + */ + + /* + * For now, we do allow update to btrfs_fs_device through the + * btrfs dev scan cli after FS has been mounted. We're still + * tracking a problem where systems fail mount by subvolume id + * when we reject replacement on a mounted FS. + */ + if (!fs_devices->opened && found_transid < device->generation) { + /* + * That is if the FS is _not_ mounted and if you + * are here, that means there is more than one + * disk with same uuid and devid.We keep the one + * with larger generation number or the last-in if + * generation are equal. + */ + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_err(NULL, +"device %s already registered with a higher generation, found %llu expect %llu", + path, found_transid, device->generation); + return ERR_PTR(-EEXIST); + } + + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + * + * NOTE: the device->fs_info may not be reliable here so pass + * in a NULL to message helpers instead. This avoids a possible + * use-after-free when the fs_info and fs_info->sb are already + * torn down. 
+ */ + if (device->bdev) { + if (device->devt != path_devt) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(NULL, + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); + return ERR_PTR(-EEXIST); + } + btrfs_info_in_rcu(NULL, + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, btrfs_dev_name(device), + path, current->comm, + task_pid_nr(current)); + } + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_PTR(-ENOMEM); + } + rcu_string_free(device->name); + rcu_assign_pointer(device->name, name); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + fs_devices->missing_devices--; + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + } + device->devt = path_devt; + } + + /* + * Unmount does not free the btrfs_device struct but would zero + * generation along with most of the other members. So just update + * it back. We need it to pick the disk with largest generation + * (as above). + */ + if (!fs_devices->opened) { + device->generation = found_transid; + fs_devices->latest_generation = max_t(u64, found_transid, + fs_devices->latest_generation); + } + + fs_devices->total_devices = btrfs_super_num_devices(disk_super); + + mutex_unlock(&fs_devices->device_list_mutex); + return device; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + int ret = 0; + + lockdep_assert_held(&uuid_mutex); + + fs_devices = alloc_fs_devices(orig->fsid, NULL); + if (IS_ERR(fs_devices)) + return fs_devices; + + fs_devices->total_devices = orig->total_devices; + + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + const char *dev_path = NULL; + + /* + * This is ok to do without RCU read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) + dev_path = orig_dev->name->str; + + device = btrfs_alloc_device(NULL, &orig_dev->devid, + orig_dev->uuid, dev_path); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + if (orig_dev->zone_info) { + struct btrfs_zoned_device_info *zone_info; + + zone_info = btrfs_clone_dev_zone_info(orig_dev); + if (!zone_info) { + btrfs_free_device(device); + ret = -ENOMEM; + goto error; + } + device->zone_info = zone_info; + } + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(ret); +} + +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + struct btrfs_device **latest_dev) +{ + struct btrfs_device *device, *next; + + /* This is the initialized path, it is safe to release the devices. */ + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state) && + !test_bit(BTRFS_DEV_STATE_MISSING, + &device->dev_state) && + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; + } + continue; + } + + /* + * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, + * in btrfs_init_dev_replace() so just continue. 
+ */ + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + continue; + + if (device->bdev) { + blkdev_put(device->bdev, device->holder); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + list_del_init(&device->dev_alloc_list); + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + btrfs_free_device(device); + } + +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. + */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; + + mutex_lock(&uuid_mutex); + __btrfs_free_extra_devids(fs_devices, &latest_dev); + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, &latest_dev); + + fs_devices->latest_dev = latest_dev; + + mutex_unlock(&uuid_mutex); +} + +static void btrfs_close_bdev(struct btrfs_device *device) +{ + if (!device->bdev) + return; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + blkdev_put(device->bdev, device->holder); +} + +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + fs_devices->missing_devices--; + } + + btrfs_close_bdev(device); + if (device->bdev) { + fs_devices->open_devices--; + device->bdev = NULL; + } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + btrfs_destroy_dev_zone_info(device); + + device->fs_info = NULL; + atomic_set(&device->dev_stats_ccnt, 0); + extent_io_tree_release(&device->alloc_state); + + /* + * Reset the flush error record. We might have a transient flush error + * in this mount, and if so we aborted the current transaction and set + * the fs to an error state, guaranteeing no super blocks can be further + * committed. However that error might be transient and if we unmount the + * filesystem and mount it again, we should allow the mount to succeed + * (btrfs_check_rw_degradable() should not fail) - if after mounting the + * filesystem again we still get flush errors, then we will again abort + * any transaction and set the error state, guaranteeing no commits of + * unsafe super blocks. 
+ */ + device->last_flush_error = 0; + + /* Verify the device is back in a pristine state */ + WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + WARN_ON(!list_empty(&device->dev_alloc_list)); + WARN_ON(!list_empty(&device->post_commit_list)); +} + +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device, *tmp; + + lockdep_assert_held(&uuid_mutex); + + if (--fs_devices->opened > 0) + return; + + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) + btrfs_close_one_device(device); + + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = false; + fs_devices->fs_info = NULL; +} + +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; + + mutex_lock(&uuid_mutex); + close_fs_devices(fs_devices); + if (!fs_devices->opened) { + list_splice_init(&fs_devices->seed_list, &list); + + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without the needing device-scan step. Therefore, it can be + * fully freed. + */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { + close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); + free_fs_devices(fs_devices); + } + mutex_unlock(&uuid_mutex); +} + +static int open_fs_devices(struct btrfs_fs_devices *fs_devices, + blk_mode_t flags, void *holder) +{ + struct btrfs_device *device; + struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; + + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; + + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { + latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } + } + if (fs_devices->open_devices == 0) + return -EINVAL; + + fs_devices->opened = 1; + fs_devices->latest_dev = latest_dev; + fs_devices->total_rw_bytes = 0; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; + fs_devices->read_policy = BTRFS_READ_POLICY_PID; + + return 0; +} + +static int devid_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + const struct btrfs_device *dev1, *dev2; + + dev1 = list_entry(a, struct btrfs_device, dev_list); + dev2 = list_entry(b, struct btrfs_device, dev_list); + + if (dev1->devid < dev2->devid) + return -1; + else if (dev1->devid > dev2->devid) + return 1; + return 0; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + blk_mode_t flags, void *holder) +{ + int ret; + + lockdep_assert_held(&uuid_mutex); + /* + * The device_list_mutex cannot be taken here in case opening the + * underlying device takes further locks like open_mutex. 
+ * + * We also don't need the lock here as this is called during mount and + * exclusion is provided by uuid_mutex + */ + + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + list_sort(NULL, &fs_devices->devices, devid_cmp); + ret = open_fs_devices(fs_devices, flags, holder); + } + + return ret; +} + +void btrfs_release_disk_super(struct btrfs_super_block *super) +{ + struct page *page = virt_to_page(super); + + put_page(page); +} + +static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + u64 bytenr, u64 bytenr_orig) +{ + struct btrfs_super_block *disk_super; + struct page *page; + void *p; + pgoff_t index; + + /* make sure our super fits in the device */ + if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) + return ERR_PTR(-EINVAL); + + /* make sure our super fits in the page */ + if (sizeof(*disk_super) > PAGE_SIZE) + return ERR_PTR(-EINVAL); + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_SHIFT; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) + return ERR_PTR(-EINVAL); + + /* pull in the page with our super */ + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); + + if (IS_ERR(page)) + return ERR_CAST(page); + + p = page_address(page); + + /* align our pointer to the offset of the super block */ + disk_super = p + offset_in_page(bytenr); + + if (btrfs_super_bytenr(disk_super) != bytenr_orig || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(p); + return ERR_PTR(-EINVAL); + } + + if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; + + return disk_super; +} + +int btrfs_forget_devices(dev_t devt) +{ + int ret; + + mutex_lock(&uuid_mutex); + ret = btrfs_free_stale_devices(devt, NULL); + mutex_unlock(&uuid_mutex); + + return ret; +} + +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */ +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) +{ + struct btrfs_super_block *disk_super; + bool new_device_added = false; + struct btrfs_device *device = NULL; + struct block_device *bdev; + u64 bytenr, bytenr_orig; + int ret; + + lockdep_assert_held(&uuid_mutex); + + /* + * we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + + /* + * Avoid an exclusive open here, as the systemd-udev may initiate the + * device scan which may race with the user's mount or mkfs command, + * resulting in failure. + * Since the device scan is solely for reading purposes, there is no + * need for an exclusive open. Additionally, the devices are read again + * during the mount process. It is ok to get some inconsistent + * values temporarily, as the device paths of the fsid are the only + * required information for assembling the volume. 
+ */ + bdev = blkdev_get_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + bytenr_orig = btrfs_sb_offset(0); + ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + if (ret) { + device = ERR_PTR(ret); + goto error_bdev_put; + } + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); + if (IS_ERR(disk_super)) { + device = ERR_CAST(disk_super); + goto error_bdev_put; + } + + device = device_list_add(path, disk_super, &new_device_added); + if (!IS_ERR(device) && new_device_added) + btrfs_free_stale_devices(device->devt, device); + + btrfs_release_disk_super(disk_super); + +error_bdev_put: + blkdev_put(bdev, NULL); + + return device; +} + +/* + * Try to find a chunk that intersects [start, start + len] range and when one + * such is found, record the end of it in *start + */ +static bool contains_pending_extent(struct btrfs_device *device, u64 *start, + u64 len) +{ + u64 physical_start, physical_end; + + lockdep_assert_held(&device->fs_info->chunk_mutex); + + if (find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { + + if (in_range(physical_start, *start, len) || + in_range(*start, physical_start, + physical_end - physical_start)) { + *start = physical_end + 1; + return true; + } + } + return false; +} + +static u64 dev_extent_search_start(struct btrfs_device *device) +{ + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + return BTRFS_DEVICE_RANGE_RESERVED; + case BTRFS_CHUNK_ALLOC_ZONED: + /* + * We don't care about the starting region like regular + * allocator, because we anyway use/reserve the first two zones + * for superblock logging. + */ + return 0; + default: + BUG(); + } +} + +static bool dev_extent_hole_check_zoned(struct btrfs_device *device, + u64 *hole_start, u64 *hole_size, + u64 num_bytes) +{ + u64 zone_size = device->zone_info->zone_size; + u64 pos; + int ret; + bool changed = false; + + ASSERT(IS_ALIGNED(*hole_start, zone_size)); + + while (*hole_size > 0) { + pos = btrfs_find_allocatable_zones(device, *hole_start, + *hole_start + *hole_size, + num_bytes); + if (pos != *hole_start) { + *hole_size = *hole_start + *hole_size - pos; + *hole_start = pos; + changed = true; + if (*hole_size < num_bytes) + break; + } + + ret = btrfs_ensure_empty_zones(device, pos, num_bytes); + + /* Range is ensured to be empty */ + if (!ret) + return changed; + + /* Given hole range was invalid (outside of device) */ + if (ret == -ERANGE) { + *hole_start += *hole_size; + *hole_size = 0; + return true; + } + + *hole_start += zone_size; + *hole_size -= zone_size; + changed = true; + } + + return changed; +} + +/* + * Check if specified hole is suitable for allocation. + * + * @device: the device which we have the hole + * @hole_start: starting position of the hole + * @hole_size: the size of the hole + * @num_bytes: the size of the free space that we need + * + * This function may modify @hole_start and @hole_size to reflect the suitable + * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + */ +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, + u64 *hole_size, u64 num_bytes) +{ + bool changed = false; + u64 hole_end = *hole_start + *hole_size; + + for (;;) { + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. 
+ */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, + hole_size, num_bytes)) { + changed = true; + /* + * The changed hole can contain pending extent. + * Loop again to check that. + */ + continue; + } + break; + default: + BUG(); + } + + break; + } + + return changed; +} + +/* + * Find free space in the specified device. + * + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @search_start: the position from which to begin the search + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size + * of the max free space if we don't find suitable free space + * + * This does a pretty simple search, the expectation is that it is called very + * infrequently and that a given device has a small number of extents. + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. + * + * NOTE: This function will search *commit* root of device tree, and does extra + * check to ensure dev extents are not double allocated. + * This makes the function safe to allocate dev extents but may not report + * correct usable device space, as device extent freed in current transaction + * is not reported as available. 
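A minimal sketch of the calling convention described above (hypothetical caller; the caller must hold device->fs_info->chunk_mutex, which contains_pending_extent() above asserts):

static int example_find_hole(struct btrfs_device *device, u64 num_bytes)
{
	u64 start;
	u64 len;
	int ret;

	ret = find_free_dev_extent(device, num_bytes, &start, &len);
	if (ret == -ENOSPC) {
		/* No hole big enough; start/len describe the largest hole found. */
		return ret;
	}
	if (ret < 0)
		return ret;

	/* [start, start + num_bytes) is unused in the commit-root view. */
	return 0;
}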
+ */ +static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 search_start; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size = 0; + u64 extent_end; + u64 search_end = device->total_bytes; + int ret; + int slot; + struct extent_buffer *l; + + search_start = dev_extent_search_start(device); + max_hole_start = search_start; + + WARN_ON(device->zone_info && + !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } +again: + if (search_start >= search_end || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { + ret = -ENOSPC; + goto out; + } + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) + goto out; + + while (search_start < search_end) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (key.type != BTRFS_DEV_EXTENT_KEY) + goto next; + + if (key.offset > search_end) + break; + + if (key.offset > search_start) { + hole_size = key.offset - search_start; + dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes); + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; + } + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; +next: + path->slots[0]++; + cond_resched(); + } + + /* + * At this point, search_start should be the end of + * allocated dev extents, and when shrinking the device, + * search_end may be smaller than search_start. + */ + if (search_end > search_start) { + hole_size = search_end - search_start; + if (dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes)) { + btrfs_release_path(path); + goto again; + } + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + } + + /* See above. 
*/ + if (max_hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + + ASSERT(max_hole_start + max_hole_size <= search_end); +out: + btrfs_free_path(path); + *start = max_hole_start; + if (len) + *len = max_hole_size; + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start, u64 *dev_extent_len) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + if (ret) + goto out; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } else { + goto out; + } + + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + + ret = btrfs_del_item(trans, root, path); + if (ret == 0) + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); +out: + btrfs_free_path(path); + return ret; +} + +static u64 find_next_chunk(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct rb_node *n; + u64 ret = 0; + + em_tree = &fs_info->mapping_tree; + read_lock(&em_tree->lock); + n = rb_last(&em_tree->map.rb_root); + if (n) { + em = rb_entry(n, struct extent_map, rb_node); + ret = em->start + em->len; + } + read_unlock(&em_tree->lock); + + return ret; +} + +static noinline int find_next_devid(struct btrfs_fs_info *fs_info, + u64 *devid_ret) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + if (ret == 0) { + /* Corruption */ + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); + ret = -EUCLEAN; + goto error; + } + + ret = btrfs_previous_item(fs_info->chunk_root, path, + BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *devid_ret = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *devid_ret = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 
device->devid; + + btrfs_reserve_chunk_metadata(trans, true); + ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, + &key, sizeof(*dev_item)); + btrfs_trans_release_chunk_metadata(trans); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, + ptr, BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(trans, leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + * + * We don't care about errors here, this is just to be kind to userspace. + */ +static void update_dev_time(const char *device_path) +{ + struct path path; + int ret; + + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); + if (ret) + return; + + inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); + path_put(&path); +} + +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + struct btrfs_root *root = device->fs_info->chunk_root; + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + btrfs_trans_release_chunk_metadata(trans); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Verify that @num_devices satisfies the RAID profile constraints in the whole + * filesystem. It's up to the caller to adjust that number regarding eg. device + * replace. 
+ */ +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, + u64 num_devices) +{ + u64 all_avail; + unsigned seq; + int i; + + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + all_avail = fs_info->avail_data_alloc_bits | + fs_info->avail_system_alloc_bits | + fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!(all_avail & btrfs_raid_array[i].bg_flag)) + continue; + + if (num_devices < btrfs_raid_array[i].devs_min) + return btrfs_raid_array[i].mindev_error; + } + + return 0; +} + +static struct btrfs_device * btrfs_find_next_active_device( + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) +{ + struct btrfs_device *next_device; + + list_for_each_entry(next_device, &fs_devs->devices, dev_list) { + if (next_device != device && + !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) + && next_device->bdev) + return next_device; + } + + return NULL; +} + +/* + * Helper function to check if the given device is part of s_bdev / latest_dev + * and replace it with the provided or the next active device, in the context + * where this function called, there should be always be another device (or + * this_dev) which is active. + */ +void __cold btrfs_assign_next_active_device(struct btrfs_device *device, + struct btrfs_device *next_device) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + + if (!next_device) + next_device = btrfs_find_next_active_device(fs_info->fs_devices, + device); + ASSERT(next_device); + + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == device->bdev)) + fs_info->sb->s_bdev = next_device->bdev; + + if (fs_info->fs_devices->latest_dev->bdev == device->bdev) + fs_info->fs_devices->latest_dev = next_device; +} + +/* + * Return btrfs_fs_devices::num_devices excluding the device that's being + * currently replaced. 
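Taken together with btrfs_check_raid_min_devices() above, the pre-removal sanity check looks roughly like this (condensed from btrfs_rm_device() further below):

	u64 num_devices = btrfs_num_devices(fs_info);
	int ret;

	/* Would dropping one device violate e.g. the RAID1 two-device minimum? */
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		return ret;	/* the offending profile's mindev_error */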
+ */ +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) +{ + u64 num_devices = fs_info->fs_devices->num_devices; + + down_read(&fs_info->dev_replace.rwsem); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + ASSERT(num_devices > 1); + num_devices--; + } + up_read(&fs_info->dev_replace.rwsem); + + return num_devices; +} + +static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, + struct block_device *bdev, int copy_num) +{ + struct btrfs_super_block *disk_super; + const size_t len = sizeof(disk_super->magic); + const u64 bytenr = btrfs_sb_offset(copy_num); + int ret; + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + if (IS_ERR(disk_super)) + return; + + memset(&disk_super->magic, 0, len); + folio_mark_dirty(virt_to_folio(disk_super)); + btrfs_release_disk_super(disk_super); + + ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); + if (ret) + btrfs_warn(fs_info, "error clearing superblock number %d (%d)", + copy_num, ret); +} + +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) +{ + int copy_num; + + if (!bdev) + return; + + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { + if (bdev_is_zoned(bdev)) + btrfs_reset_sb_log_zones(bdev, copy_num); + else + btrfs_scratch_superblock(fs_info, bdev, copy_num); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); +} + +int btrfs_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + struct block_device **bdev, void **holder) +{ + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 num_devices; + int ret = 0; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); + return -EINVAL; + } + + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. 
+ */ + num_devices = btrfs_num_devices(fs_info); + + ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); + if (ret) + return ret; + + device = btrfs_find_device(fs_info->fs_devices, args); + if (!device) { + if (args->missing) + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + else + ret = -ENOENT; + return ret; + } + + if (btrfs_pinned_by_swapfile(fs_info, device)) { + btrfs_warn_in_rcu(fs_info, + "cannot remove device %s (devid %llu) due to active swapfile", + btrfs_dev_name(device), device->devid); + return -ETXTBSY; + } + + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + mutex_lock(&fs_info->chunk_mutex); + list_del_init(&device->dev_alloc_list); + device->fs_devices->rw_devices--; + mutex_unlock(&fs_info->chunk_mutex); + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_undo; + + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + btrfs_scrub_cancel_dev(device); + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. Whoever is writing all supers, should + * lock the device list mutex before getting the number of + * devices in the super block (super_copy). Conversely, + * whoever updates the number of devices in the super block + * (super_copy) should hold the device list mutex. + */ + + /* + * In normal cases the cur_devices == fs_devices. But in case + * of deleting a seed device, the cur_devices should point to + * its own fs_devices listed under the fs_devices->seed_list. + */ + cur_devices = device->fs_devices; + mutex_lock(&fs_devices->device_list_mutex); + list_del_rcu(&device->dev_list); + + cur_devices->num_devices--; + cur_devices->total_devices--; + /* Update total_devices of the parent fs_devices if it's seed */ + if (cur_devices != fs_devices) + fs_devices->total_devices--; + + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + cur_devices->missing_devices--; + + btrfs_assign_next_active_device(device, NULL); + + if (device->bdev) { + cur_devices->open_devices--; + /* remove sysfs entry */ + btrfs_sysfs_remove_device(device); + } + + num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; + btrfs_set_super_num_devices(fs_info->super_copy, num_devices); + mutex_unlock(&fs_devices->device_list_mutex); + + /* + * At this point, the device is zero sized and detached from the + * devices list. All that's left is to zero out the old supers and + * free the device. + * + * We cannot call btrfs_close_bdev() here because we're holding the sb + * write lock, and blkdev_put() will pull in the ->open_mutex on the + * block device and it's dependencies. Instead just flush the device + * and let the caller do the final blkdev_put. 
+ */ + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + btrfs_scratch_superblocks(fs_info, device->bdev, + device->name->str); + if (device->bdev) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + } + + *bdev = device->bdev; + *holder = device->holder; + synchronize_rcu(); + btrfs_free_device(device); + + /* + * This can happen if cur_devices is the private seed devices list. We + * cannot call close_fs_devices() here because it expects the uuid_mutex + * to be held, but in fact we don't need that for the private + * seed_devices, we can simply decrement cur_devices->opened and then + * remove it from our list and free the fs_devices. + */ + if (cur_devices->num_devices == 0) { + list_del_init(&cur_devices->seed_list); + ASSERT(cur_devices->opened == 1); + cur_devices->opened--; + free_fs_devices(cur_devices); + } + + ret = btrfs_commit_transaction(trans); + + return ret; + +error_undo: + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + mutex_lock(&fs_info->chunk_mutex); + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + device->fs_devices->rw_devices++; + mutex_unlock(&fs_info->chunk_mutex); + } + return ret; +} + +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices; + + lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); + + /* + * in case of fs with no seed, srcdev->fs_devices will point + * to fs_devices of fs_info. However when the dev being replaced is + * a seed dev it will point to the seed's local fs_devices. In short + * srcdev will have its correct fs_devices in both the cases. + */ + fs_devices = srcdev->fs_devices; + + list_del_rcu(&srcdev->dev_list); + list_del(&srcdev->dev_alloc_list); + fs_devices->num_devices--; + if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) + fs_devices->missing_devices--; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) + fs_devices->rw_devices--; + + if (srcdev->bdev) + fs_devices->open_devices--; +} + +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + + mutex_lock(&uuid_mutex); + + btrfs_close_bdev(srcdev); + synchronize_rcu(); + btrfs_free_device(srcdev); + + /* if this is no devs we rather delete the fs_devices */ + if (!fs_devices->num_devices) { + /* + * On a mounted FS, num_devices can't be zero unless it's a + * seed. In case of a seed device being replaced, the replace + * target added to the sprout FS, so there will be no more + * device left under the seed FS. + */ + ASSERT(fs_devices->seeding); + + list_del_init(&fs_devices->seed_list); + close_fs_devices(fs_devices); + free_fs_devices(fs_devices); + } + mutex_unlock(&uuid_mutex); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) +{ + struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; + + mutex_lock(&fs_devices->device_list_mutex); + + btrfs_sysfs_remove_device(tgtdev); + + if (tgtdev->bdev) + fs_devices->open_devices--; + + fs_devices->num_devices--; + + btrfs_assign_next_active_device(tgtdev, NULL); + + list_del_rcu(&tgtdev->dev_list); + + mutex_unlock(&fs_devices->device_list_mutex); + + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, + tgtdev->name->str); + + btrfs_close_bdev(tgtdev); + synchronize_rcu(); + btrfs_free_device(tgtdev); +} + +/* + * Populate args from device at path. 
+ * + * @fs_info: the filesystem + * @args: the args to populate + * @path: the path to the device + * + * This will read the super block of the device at @path and populate @args with + * the devid, fsid, and uuid. This is meant to be used for ioctls that need to + * lookup a device to operate on, but need to do it before we take any locks. + * This properly handles the special case of "missing" that a user may pass in, + * and does some basic sanity checks. The caller must make sure that @path is + * properly NUL terminated before calling in, and must call + * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and + * uuid buffers. + * + * Return: 0 for success, -errno for failure + */ +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + int ret; + + if (!path || !path[0]) + return -EINVAL; + if (!strcmp(path, "missing")) { + args->missing = true; + return 0; + } + + args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); + args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); + if (!args->uuid || !args->fsid) { + btrfs_put_dev_args_from_path(args); + return -ENOMEM; + } + + ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, + &bdev, &disk_super); + if (ret) { + btrfs_put_dev_args_from_path(args); + return ret; + } + + args->devid = btrfs_stack_device_id(&disk_super->dev_item); + memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); + else + memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + btrfs_release_disk_super(disk_super); + blkdev_put(bdev, NULL); + return 0; +} + +/* + * Only use this jointly with btrfs_get_dev_args_from_path() because we will + * allocate our ->uuid and ->fsid pointers, everybody else uses local variables + * that don't need to be freed. + */ +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) +{ + kfree(args->uuid); + kfree(args->fsid); + args->uuid = NULL; + args->fsid = NULL; +} + +struct btrfs_device *btrfs_find_device_by_devspec( + struct btrfs_fs_info *fs_info, u64 devid, + const char *device_path) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct btrfs_device *device; + int ret; + + if (devid) { + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); + if (!device) + return ERR_PTR(-ENOENT); + return device; + } + + ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); + if (ret) + return ERR_PTR(ret); + device = btrfs_find_device(fs_info->fs_devices, &args); + btrfs_put_dev_args_from_path(&args); + if (!device) + return ERR_PTR(-ENOENT); + return device; +} + +static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + + lockdep_assert_held(&uuid_mutex); + if (!fs_devices->seeding) + return ERR_PTR(-EINVAL); + + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ + seed_devices = alloc_fs_devices(NULL, NULL); + if (IS_ERR(seed_devices)) + return seed_devices; + + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. 
+ */ + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return old_devices; + } + + list_add(&old_devices->fs_list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); + + return seed_devices; +} + +/* + * Splice seed devices into the sprout fs_devices. + * Generate a new fsid for the sprouted read-write filesystem. + */ +static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *seed_devices) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + /* + * We are updating the fsid, the thread leading to device_list_add() + * could race, so uuid_mutex is needed. + */ + lockdep_assert_held(&uuid_mutex); + + /* + * The threads listed below may traverse dev_list but can do that without + * device_list_mutex: + * - All device ops and balance - as we are in btrfs_exclop_start. + * - Various dev_list readers - are using RCU. + * - btrfs_ioctl_fitrim() - is using RCU. + * + * For-read threads as below are using device_list_mutex: + * - Readonly scrub btrfs_scrub_dev() + * - Readonly scrub btrfs_scrub_progress() + * - btrfs_get_dev_stats() + */ + lockdep_assert_held(&fs_devices->device_list_mutex); + + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, + synchronize_rcu); + list_for_each_entry(device, &seed_devices->devices, dev_list) + device->fs_devices = seed_devices; + + fs_devices->seeding = false; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->missing_devices = 0; + fs_devices->rotating = false; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); + + generate_random_uuid(fs_devices->fsid); + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); +} + +/* + * Store the expected generation for seed devices in device items. 
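/*
 * A hedged userspace sketch of the subtlety handled by btrfs_init_sprout()
 * above: after the structure is copied with memcpy(), its embedded list heads
 * still point back into the original object and must be re-initialized
 * (INIT_LIST_HEAD in the kernel). The two-pointer list_head and the
 * fake_fs_devices type below are invented stand-ins, not kernel API.
 */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

struct fake_fs_devices {
	int opened;
	struct list_head devices;	/* intrusive list head */
};

int main(void)
{
	struct fake_fs_devices orig, seed;

	list_init(&orig.devices);
	orig.opened = 1;

	seed = orig;			/* struct copy: seed.devices still points at &orig.devices */
	list_init(&seed.devices);	/* re-initialize, as btrfs_init_sprout() does */

	printf("seed list self-referential: %d\n", seed.devices.next == &seed.devices);	/* prints 1 */
	return 0;
}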
+ */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_FSID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + btrfs_trans_release_chunk_metadata(trans); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + args.devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), + BTRFS_FSID_SIZE); + args.uuid = dev_uuid; + args.fsid = fs_uuid; + device = btrfs_find_device(fs_info->fs_devices, &args); + BUG_ON(!device); /* Logic error */ + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(trans, leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) +{ + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct super_block *sb = fs_info->sb; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *seed_devices = NULL; + u64 orig_super_total_bytes; + u64 orig_super_num_devices; + int ret = 0; + bool seeding_dev = false; + bool locked = false; + + if (sb_rdonly(sb) && !fs_devices->seeding) + return -EROFS; + + bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + ret = -EINVAL; + goto error; + } + + if (fs_devices->seeding) { + seeding_dev = true; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + locked = true; + } + + sync_blockdev(bdev); + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + rcu_read_unlock(); + goto error; + } + } + rcu_read_unlock(); + + device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); + if (IS_ERR(device)) { + /* we can safely leave the fs_devices entry around */ + ret = PTR_ERR(device); + goto error; + } + + device->fs_info = fs_info; + device->bdev = bdev; + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error_free_device; + + ret = btrfs_get_dev_zone_info(device, false); + if (ret) + goto error_free_device; + + trans = btrfs_start_transaction(root, 0); + if 
(IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error_free_zone; + } + + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + device->generation = trans->transid; + device->io_width = fs_info->sectorsize; + device->io_align = fs_info->sectorsize; + device->sector_size = fs_info->sectorsize; + device->total_bytes = + round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); + device->disk_total_bytes = device->total_bytes; + device->commit_total_bytes = device->total_bytes; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + device->holder = fs_info->bdev_holder; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + + if (seeding_dev) { + btrfs_clear_sb_rdonly(sb); + + /* GFP_KERNEL allocation must not be under device_list_mutex */ + seed_devices = btrfs_init_sprout(fs_info); + if (IS_ERR(seed_devices)) { + ret = PTR_ERR(seed_devices); + btrfs_abort_transaction(trans, ret); + goto error_trans; + } + } + + mutex_lock(&fs_devices->device_list_mutex); + if (seeding_dev) { + btrfs_setup_sprout(fs_info, seed_devices); + btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, + device); + } + + device->fs_devices = fs_devices; + + mutex_lock(&fs_info->chunk_mutex); + list_add_rcu(&device->dev_list, &fs_devices->devices); + list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->num_devices++; + fs_devices->open_devices++; + fs_devices->rw_devices++; + fs_devices->total_devices++; + fs_devices->total_rw_bytes += device->total_bytes; + + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); + + if (!bdev_nonrot(bdev)) + fs_devices->rotating = true; + + orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + btrfs_set_super_total_bytes(fs_info->super_copy, + round_down(orig_super_total_bytes + device->total_bytes, + fs_info->sectorsize)); + + orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); + btrfs_set_super_num_devices(fs_info->super_copy, + orig_super_num_devices + 1); + + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(fs_info); + + mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_device(device); + + mutex_unlock(&fs_devices->device_list_mutex); + + if (seeding_dev) { + mutex_lock(&fs_info->chunk_mutex); + ret = init_first_rw_device(trans); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_sysfs; + } + } + + ret = btrfs_add_dev_item(trans, device); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_sysfs; + } + + if (seeding_dev) { + ret = btrfs_finish_sprout(trans); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_sysfs; + } + + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_sprout_splice(). + */ + btrfs_sysfs_update_sprout_fsid(fs_devices); + } + + ret = btrfs_commit_transaction(trans); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + locked = false; + + if (ret) /* transaction commit */ + return ret; + + ret = btrfs_relocate_sys_chunks(fs_info); + if (ret < 0) + btrfs_handle_fs_error(fs_info, ret, + "Failed to relocate sys chunks after device initialization. 
This can be fixed using the \"btrfs balance\" command."); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) == -ENOENT) + return 0; + ret = PTR_ERR(trans); + trans = NULL; + goto error_sysfs; + } + ret = btrfs_commit_transaction(trans); + } + + /* + * Now that we have written a new super block to this device, check all + * other fs_devices list if device_path alienates any other scanned + * device. + * We can ignore the return value as it typically returns -EINVAL and + * only succeeds if the device was an alien. + */ + btrfs_forget_devices(device->devt); + + /* Update ctime/mtime for blkid or udev */ + update_dev_time(device_path); + + return ret; + +error_sysfs: + btrfs_sysfs_remove_device(device); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->chunk_mutex); + list_del_rcu(&device->dev_list); + list_del(&device->dev_alloc_list); + fs_info->fs_devices->num_devices--; + fs_info->fs_devices->open_devices--; + fs_info->fs_devices->rw_devices--; + fs_info->fs_devices->total_devices--; + fs_info->fs_devices->total_rw_bytes -= device->total_bytes; + atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); + btrfs_set_super_total_bytes(fs_info->super_copy, + orig_super_total_bytes); + btrfs_set_super_num_devices(fs_info->super_copy, + orig_super_num_devices); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); +error_trans: + if (seeding_dev) + btrfs_set_sb_rdonly(sb); + if (trans) + btrfs_end_transaction(trans); +error_free_zone: + btrfs_destroy_dev_zone_info(device); +error_free_device: + btrfs_free_device(device); +error: + blkdev_put(bdev, fs_info->bdev_holder); + if (locked) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + return ret; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->fs_info->chunk_root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); + btrfs_mark_buffer_dirty(trans, leaf); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_super_block *super_copy = fs_info->super_copy; + u64 old_total; + u64 diff; + int ret; + + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + return -EACCES; + + new_size = round_down(new_size, fs_info->sectorsize); + + mutex_lock(&fs_info->chunk_mutex); + old_total = 
btrfs_super_total_bytes(super_copy); + diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); + + if (new_size <= device->total_bytes || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { + mutex_unlock(&fs_info->chunk_mutex); + return -EINVAL; + } + + btrfs_set_super_total_bytes(super_copy, + round_down(old_total + diff, fs_info->sectorsize)); + device->fs_devices->total_rw_bytes += diff; + + btrfs_device_set_total_bytes(device, new_size); + btrfs_device_set_disk_total_bytes(device, new_size); + btrfs_clear_space_info_full(device->fs_info); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + mutex_unlock(&fs_info->chunk_mutex); + + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = fs_info->chunk_root; + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + else if (ret > 0) { /* Logic error or corruption */ + btrfs_handle_fs_error(fs_info, -ENOENT, + "Failed lookup while freeing chunk."); + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret < 0) + btrfs_handle_fs_error(fs_info, ret, + "Failed to delete chunk item."); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +{ + struct btrfs_super_block *super_copy = fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + lockdep_assert_held(&fs_info->chunk_mutex); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. 
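/*
 * A self-contained userspace sketch of the walk-and-compact pattern used by
 * btrfs_del_sys_chunk() above: scan a packed buffer of variable-length
 * records (fixed header followed by a variable payload) and delete one record
 * by memmove()ing the tail down over it. The record layout here (4-byte id
 * plus a 1-byte payload length) is made up for illustration and, unlike the
 * kernel helper, the sketch stops after the first match.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct rec_hdr {
	uint32_t id;
	uint8_t  payload_len;
} __attribute__((packed));

/* Remove the first record with the given id; returns the new used size. */
static size_t del_record(uint8_t *buf, size_t used, uint32_t id)
{
	size_t cur = 0;

	while (cur < used) {
		struct rec_hdr hdr;
		size_t len;

		memcpy(&hdr, buf + cur, sizeof(hdr));
		len = sizeof(hdr) + hdr.payload_len;

		if (hdr.id == id) {
			/* Shift everything after this record down over it. */
			memmove(buf + cur, buf + cur + len, used - (cur + len));
			return used - len;
		}
		cur += len;
	}
	return used;
}

int main(void)
{
	uint8_t buf[64];
	size_t used = 0;
	struct rec_hdr a = { 1, 2 }, b = { 2, 3 };

	memcpy(buf + used, &a, sizeof(a)); used += sizeof(a);
	memcpy(buf + used, "xy", 2);       used += 2;
	memcpy(buf + used, &b, sizeof(b)); used += sizeof(b);
	memcpy(buf + used, "abc", 3);      used += 3;

	used = del_record(buf, used, 1);
	printf("remaining bytes: %zu\n", used);	/* prints 8: only record id 2 remains */
	return 0;
}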
+ */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, + "unable to find chunk map for logical %llu length %llu", + logical, length); + return ERR_PTR(-EINVAL); + } + + if (em->start > logical || em->start + em->len <= logical) { + btrfs_crit(fs_info, + "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", + logical, logical + length, em->start, em->start + em->len); + free_extent_map(em); + return ERR_PTR(-EINVAL); + } + + /* callers are responsible for dropping em's ref. */ + return em; +} + +static int remove_chunk_item(struct btrfs_trans_handle *trans, + struct map_lookup *map, u64 chunk_offset) +{ + int i; + + /* + * Removing chunk items and updating the device items in the chunks btree + * requires holding the chunk_mutex. + * See the comment at btrfs_chunk_alloc() for the details. + */ + lockdep_assert_held(&trans->fs_info->chunk_mutex); + + for (i = 0; i < map->num_stripes; i++) { + int ret; + + ret = btrfs_update_device(trans, map->stripes[i].dev); + if (ret) + return ret; + } + + return btrfs_free_chunk(trans, chunk_offset); +} + +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_map *em; + struct map_lookup *map; + u64 dev_extent_len = 0; + int i, ret = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) { + /* + * This is a logic error, but we don't want to just rely on the + * user having built with ASSERT enabled, so if ASSERT doesn't + * do anything we still error out. + */ + ASSERT(0); + return PTR_ERR(em); + } + map = em->map_lookup; + + /* + * First delete the device extent items from the devices btree. + * We take the device_list_mutex to avoid racing with the finishing phase + * of a device replace operation. See the comment below before acquiring + * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex + * because that can result in a deadlock when deleting the device extent + * items from the devices btree - COWing an extent buffer from the btree + * may result in allocating a new metadata chunk, which would attempt to + * lock again fs_info->chunk_mutex. + */ + mutex_lock(&fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + ret = btrfs_free_dev_extent(trans, device, + map->stripes[i].physical, + &dev_extent_len); + if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; + } + + if (device->bytes_used > 0) { + mutex_lock(&fs_info->chunk_mutex); + btrfs_device_set_bytes_used(device, + device->bytes_used - dev_extent_len); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); + btrfs_clear_space_info_full(fs_info); + mutex_unlock(&fs_info->chunk_mutex); + } + } + mutex_unlock(&fs_devices->device_list_mutex); + + /* + * We acquire fs_info->chunk_mutex for 2 reasons: + * + * 1) Just like with the first phase of the chunk allocation, we must + * reserve system space, do all chunk btree updates and deletions, and + * update the system chunk array in the superblock while holding this + * mutex. 
This is for similar reasons as explained on the comment at + * the top of btrfs_chunk_alloc(); + * + * 2) Prevent races with the final phase of a device replace operation + * that replaces the device object associated with the map's stripes, + * because the device object's id can change at any time during that + * final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of + * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating + * the device item, which does not exists on the chunk btree. + * The finishing phase of device replace acquires both the + * device_list_mutex and the chunk_mutex, in that order, so we are + * safe by just acquiring the chunk_mutex. + */ + trans->removing_chunk = true; + mutex_lock(&fs_info->chunk_mutex); + + check_system_chunk(trans, map->type); + + ret = remove_chunk_item(trans, map, chunk_offset); + /* + * Normally we should not get -ENOSPC since we reserved space before + * through the call to check_system_chunk(). + * + * Despite our system space_info having enough free space, we may not + * be able to allocate extents from its block groups, because all have + * an incompatible profile, which will force us to allocate a new system + * block group with the right profile, or right after we called + * check_system_space() above, a scrub turned the only system block group + * with enough free space into RO mode. + * This is explained with more detail at do_chunk_alloc(). + * + * So if we get -ENOSPC, allocate a new system chunk and retry once. + */ + if (ret == -ENOSPC) { + const u64 sys_flags = btrfs_system_alloc_profile(fs_info); + struct btrfs_block_group *sys_bg; + + sys_bg = btrfs_create_chunk(trans, sys_flags); + if (IS_ERR(sys_bg)) { + ret = PTR_ERR(sys_bg); + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = remove_chunk_item(trans, map, chunk_offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } else if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(fs_info, chunk_offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + + mutex_unlock(&fs_info->chunk_mutex); + trans->removing_chunk = false; + + /* + * We are done with chunk btree updates and deletions, so release the + * system space we previously reserved (with check_system_chunk()). + */ + btrfs_trans_release_chunk_metadata(trans); + + ret = btrfs_remove_block_group(trans, chunk_offset, em); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + +out: + if (trans->removing_chunk) { + mutex_unlock(&fs_info->chunk_mutex); + trans->removing_chunk = false; + } + /* once for us */ + free_extent_map(em); + return ret; +} + +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +{ + struct btrfs_root *root = fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_block_group *block_group; + u64 length; + int ret; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "relocate: not supported on extent tree v2 yet"); + return -EINVAL; + } + + /* + * Prevent races with automatic removal of unused block groups. 
+ * After we relocate and before we remove the chunk with offset + * chunk_offset, automatic removal of the block group can kick in, + * resulting in a failure when calling btrfs_remove_chunk() below. + * + * Make sure to acquire this mutex before doing a tree search (dev + * or chunk trees) to find chunks. Otherwise the cleaner kthread might + * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after + * we release the path used to search the chunk/dev tree and before + * the current task acquires this mutex and calls us. + */ + lockdep_assert_held(&fs_info->reclaim_bgs_lock); + + /* step one, relocate all the extents inside this chunk */ + btrfs_scrub_pause(fs_info); + ret = btrfs_relocate_block_group(fs_info, chunk_offset); + btrfs_scrub_continue(fs_info); + if (ret) { + /* + * If we had a transaction abort, stop all running scrubs. + * See transaction.c:cleanup_transaction() why we do it here. + */ + if (BTRFS_FS_ERROR(fs_info)) + btrfs_scrub_cancel(fs_info); + return ret; + } + + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!block_group) + return -ENOENT; + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + length = block_group->length; + btrfs_put_block_group(block_group); + + /* + * On a zoned file system, discard the whole block group, this will + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If + * resetting the zone fails, don't treat it as a fatal problem from the + * filesystem's point of view. + */ + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); + if (ret) + btrfs_info(fs_info, + "failed to reset zone %llu after relocation", + chunk_offset); + } + + trans = btrfs_start_trans_remove_block_group(root->fs_info, + chunk_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_handle_fs_error(root->fs_info, ret, NULL); + return ret; + } + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + ret = btrfs_remove_chunk(trans, chunk_offset); + btrfs_end_transaction(trans); + return ret; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_type; + bool retried = false; + int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + mutex_lock(&fs_info->reclaim_bgs_lock); + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto error; + } + BUG_ON(ret == 0); /* Corruption */ + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret) + mutex_unlock(&fs_info->reclaim_bgs_lock); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(fs_info, found_key.offset); + if (ret == -ENOSPC) + failed++; + else + BUG_ON(ret); + } + mutex_unlock(&fs_info->reclaim_bgs_lock); + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; 
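/*
 * An illustrative userspace sketch of the reverse scan idiom used by
 * btrfs_relocate_sys_chunks() above (and again by __btrfs_balance() further
 * down): start the search key at (u64)-1, look up the greatest item at or
 * below it, process that item, then continue from found_offset - 1 and stop
 * once offset 0 has been visited. prev_leq() is an invented stand-in for the
 * btrfs_search_slot() + btrfs_previous_item() pair.
 */
#include <stdint.h>
#include <stdio.h>

/* Index of the greatest element <= key in a sorted array, or -1 if none. */
static int prev_leq(const uint64_t *sorted, int n, uint64_t key)
{
	int i;

	for (i = n - 1; i >= 0; i--)
		if (sorted[i] <= key)
			return i;
	return -1;
}

int main(void)
{
	const uint64_t chunk_offsets[] = { 0, 1048576, 5242880, 13631488 };
	uint64_t key = UINT64_MAX;
	int idx;

	/* Walk from the highest chunk offset down to 0. */
	while ((idx = prev_leq(chunk_offsets, 4, key)) >= 0) {
		uint64_t found = chunk_offsets[idx];

		printf("visit chunk at %llu\n", (unsigned long long)found);
		if (found == 0)
			break;
		key = found - 1;
	}
	return 0;
}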
+ if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (WARN_ON(failed && retried)) { + ret = -ENOSPC; + } +error: + btrfs_free_path(path); + return ret; +} + +/* + * return 1 : allocate a data chunk successfully, + * return <0: errors during allocating a data chunk, + * return 0 : no need to allocate a data chunk. + */ +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, + u64 chunk_offset) +{ + struct btrfs_block_group *cache; + u64 bytes_used; + u64 chunk_type; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + ASSERT(cache); + chunk_type = cache->flags; + btrfs_put_block_group(cache); + + if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + return 1; + } + + return 0; +} + +static int insert_balance_item(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_TEMPORARY_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(trans, leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_TEMPORARY_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans); + if (err && !ret) + ret = err; + return ret; +} + +/* + * This is a heuristic used to reduce the number of chunks balanced 
on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + +/* + * Clear the balance status in fs_info and delete the balance item from disk. + */ +static void reset_balance_state(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + int ret; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); + ret = del_balance_item(fs_info); + if (ret) + btrfs_handle_fs_error(fs_info, ret, NULL); +} + +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). 
+ */ +static int chunk_profiles_filter(u64 chunk_type, + struct btrfs_balance_args *bargs) +{ + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; + + if (bargs->profiles & chunk_type) + return 0; + + return 1; +} + +static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group *cache; + u64 chunk_used; + u64 user_thresh_min; + u64 user_thresh_max; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = cache->used; + + if (bargs->usage_min == 0) + user_thresh_min = 0; + else + user_thresh_min = mult_perc(cache->length, bargs->usage_min); + + if (bargs->usage_max == 0) + user_thresh_max = 1; + else if (bargs->usage_max > 100) + user_thresh_max = cache->length; + else + user_thresh_max = mult_perc(cache->length, bargs->usage_max); + + if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, + u64 chunk_offset, struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = cache->used; + + if (bargs->usage_min == 0) + user_thresh = 1; + else if (bargs->usage > 100) + user_thresh = cache->length; + else + user_thresh = mult_perc(cache->length, bargs->usage); + + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + +static u64 calc_data_stripes(u64 type, int num_stripes) +{ + const int index = btrfs_bg_flags_to_raid_index(type); + const int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + + return (num_stripes - nparity) / ncopies; +} + +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + u64 type; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + type = btrfs_chunk_type(leaf, chunk); + factor = calc_data_stripes(type, num_stripes); + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + stripe_length = div_u64(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + +static int 
chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + if (bargs->stripes_min <= num_stripes + && num_stripes <= bargs->stripes_max) + return 0; + + return 1; +} + +static int chunk_soft_convert_filter(u64 chunk_type, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; + + if (bargs->target == chunk_type) + return 1; + + return 0; +} + +static int should_balance_chunk(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(fs_info, chunk_offset, bargs)) { + return 0; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { + return 0; + } + + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, bargs)) { + return 0; + } + + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* stripes filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && + chunk_stripes_range_filter(leaf, chunk, bargs)) { + return 0; + } + + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { + /* + * Same logic as the 'limit' filter; the minimum cannot be + * determined here because we do not have the global information + * about the count of all chunks that satisfy the filters. 
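/*
 * A minimal userspace sketch of the arithmetic in chunk_usage_range_filter()
 * above, assuming mult_perc(num, percent) means num * percent / 100 (split
 * here to avoid overflow). Note the inversion: the kernel filter returns 1 to
 * filter a chunk out, while this sketch returns 1 when the block group would
 * be kept (balanced). All values in main() are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t mult_perc(uint64_t num, uint32_t percent)
{
	/* Exact integer num * percent / 100 without overflowing 64 bits. */
	return num / 100 * percent + num % 100 * percent / 100;
}

/* Keep (balance) the block group iff min% of length <= used < max% of length. */
static int usage_range_keeps(uint64_t length, uint64_t used,
			     uint32_t usage_min, uint32_t usage_max)
{
	uint64_t lo = usage_min ? mult_perc(length, usage_min) : 0;
	uint64_t hi;

	if (usage_max == 0)
		hi = 1;
	else if (usage_max > 100)
		hi = length;
	else
		hi = mult_perc(length, usage_max);

	return lo <= used && used < hi;
}

int main(void)
{
	/* A 1 GiB block group that is 300 MiB full: kept by usage=0..50,
	 * filtered out by usage=40..100. */
	uint64_t len = 1024ULL << 20, used = 300ULL << 20;

	printf("%d %d\n", usage_range_keeps(len, used, 0, 50),
			  usage_range_keeps(len, used, 40, 100));	/* prints "1 0" */
	return 0;
}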
+ */ + if (bargs->limit_max == 0) + return 0; + else + bargs->limit_max--; + } + + return 1; +} + +static int __btrfs_balance(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_root *chunk_root = fs_info->chunk_root; + u64 chunk_type; + struct btrfs_chunk *chunk; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + int ret; + int enospc_errors = 0; + bool counting = true; + /* The single value limit and min/max limits use the same bytes in the */ + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; + u32 count_data = 0; + u32 count_meta = 0; + u32 count_sys = 0; + int chunk_reserved = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: + if (!counting) { + /* + * The single value limit and min/max limits use the same bytes + * in the + */ + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -ECANCELED; + goto error; + } + + mutex_lock(&fs_info->reclaim_bgs_lock); + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto error; + } + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + BUG(); /* FIXME break ? 
*/ + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + ret = 0; + break; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + break; + } + + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + + ret = should_balance_chunk(leaf, chunk, found_key.offset); + + btrfs_release_path(path); + if (!ret) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto loop; + } + + if (counting) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + count_data++; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + count_sys++; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + count_meta++; + + goto loop; + } + + /* + * Apply limit_min filter, no need to check if the LIMITS + * filter is used, limit_min is 0 by default + */ + if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + count_data < bctl->data.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && + count_meta < bctl->meta.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && + count_sys < bctl->sys.limit_min)) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto loop; + } + + if (!chunk_reserved) { + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, + found_key.offset); + if (ret < 0) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto error; + } else if (ret == 1) { + chunk_reserved = 1; + } + } + + ret = btrfs_relocate_chunk(fs_info, found_key.offset); + mutex_unlock(&fs_info->reclaim_bgs_lock); + if (ret == -ENOSPC) { + enospc_errors++; + } else if (ret == -ETXTBSY) { + btrfs_info(fs_info, + "skipping relocation of block group %llu due to active swapfile", + found_key.offset); + ret = 0; + } else if (ret) { + goto error; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } +loop: + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } +error: + btrfs_free_path(path); + if (enospc_errors) { + btrfs_info(fs_info, "%d enospc errors during balance", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + + return ret; +} + +/* + * See if a given profile is valid and reduced. + * + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static int alloc_profile_is_valid(u64 flags, int extended) +{ + u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : + BTRFS_BLOCK_GROUP_PROFILE_MASK); + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + + /* 1) check that all other bits are zeroed */ + if (flags & ~mask) + return 0; + + /* 2) see if profile is reduced */ + if (flags == 0) + return !extended; /* "0" is valid for usual profiles */ + + return has_single_bit_set(flags); +} + +/* + * Validate target profile against allowed profiles and return true if it's OK. + * Otherwise print the error message and return false. 
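/*
 * An illustrative stand-in for the has_single_bit_set() check relied on by
 * alloc_profile_is_valid() above: a profile is "reduced" when, after masking
 * off the type bits, exactly one profile bit remains. The bit positions used
 * in main() are made up; only the single-bit property matters here.
 */
#include <stdint.h>
#include <stdio.h>

static int has_single_bit_set(uint64_t flags)
{
	return flags != 0 && (flags & (flags - 1)) == 0;
}

int main(void)
{
	const uint64_t PROFILE_A = 1ULL << 4;
	const uint64_t PROFILE_B = 1ULL << 6;

	printf("%d\n", has_single_bit_set(PROFILE_A));			/* 1: reduced */
	printf("%d\n", has_single_bit_set(PROFILE_A | PROFILE_B));	/* 0: not reduced */
	printf("%d\n", has_single_bit_set(0));				/* 0 */
	return 0;
}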
+ */ +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, + const struct btrfs_balance_args *bargs, + u64 allowed, const char *type) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return true; + + /* Profile is valid and does not have bits outside of the allowed set */ + if (alloc_profile_is_valid(bargs->target, 1) && + (bargs->target & ~allowed) == 0) + return true; + + btrfs_err(fs_info, "balance: invalid convert %s profile %s", + type, btrfs_bg_type_to_raid_name(bargs->target)); + return false; +} + +/* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, + u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do { \ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_CONVERT) + CHECK_APPEND_1ARG("convert=%s,", + btrfs_bg_type_to_raid_name(bargs->target)); + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) + CHECK_APPEND_2ARG("vrange=%llu..%llu,", + bargs->vstart, bargs->vend); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) + CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) + CHECK_APPEND_2ARG("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) + CHECK_APPEND_2ARG("stripes=%u..%u,", + bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ + else + buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ + u32 size_buf = 1024; + char tmp_buf[192] = {'\0'}; + char *buf; + char *bp; + u32 size_bp = size_buf; + int ret; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + buf = kzalloc(size_buf, GFP_KERNEL); + if (!buf) + return; + + bp = buf; + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + 
bp += ret; \ + } while (0) + + if (bctl->flags & BTRFS_BALANCE_FORCE) + CHECK_APPEND_1ARG("%s", "-f "); + + if (bctl->flags & BTRFS_BALANCE_DATA) { + describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-d%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_METADATA) { + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-m%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-s%s ", tmp_buf); + } + +#undef CHECK_APPEND_1ARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ + btrfs_info(fs_info, "balance: %s %s", + (bctl->flags & BTRFS_BALANCE_RESUME) ? + "resume" : "start", buf); + + kfree(buf); +} + +/* + * Should be called with balance mutexe held + */ +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + u64 meta_target, data_target; + u64 allowed; + int mixed = 0; + int ret; + u64 num_devices; + unsigned seq; + bool reducing_redundancy; + bool paused = false; + int i; + + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req) || + btrfs_should_cancel_balance(fs_info)) { + ret = -EINVAL; + goto out; + } + + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; + if (mixed && (bctl->flags & allowed)) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + btrfs_err(fs_info, + "balance: mixed groups data and metadata options must be the same"); + ret = -EINVAL; + goto out; + } + } + + /* + * rw_devices will not change at the moment, device add/delete/replace + * are exclusive + */ + num_devices = fs_info->fs_devices->rw_devices; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. 
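/*
 * A self-contained userspace sketch of the bounded append pattern implemented
 * by the CHECK_APPEND_*() macros above: snprintf into the remaining space,
 * treat a negative or truncated result as overflow, otherwise advance the
 * write pointer and shrink the remaining size, and finally trim the trailing
 * separator. buf_append() is an invented helper, not kernel API.
 */
#include <stdio.h>
#include <string.h>

static int buf_append(char **bp, size_t *size_bp, const char *fmt,
		      const char *arg)
{
	int ret = snprintf(*bp, *size_bp, fmt, arg);

	if (ret < 0 || (size_t)ret >= *size_bp)
		return -1;	/* output would have been truncated */
	*bp += ret;
	*size_bp -= ret;
	return 0;
}

int main(void)
{
	char buf[64];
	char *bp = buf;
	size_t size_bp = sizeof(buf);

	buf[0] = '\0';
	if (buf_append(&bp, &size_bp, "convert=%s,", "raid1") == 0 &&
	    buf_append(&bp, &size_bp, "%s", "soft,") == 0) {
		/* Trim the trailing ',' as describe_balance_args() does. */
		buf[strlen(buf) - 1] = '\0';
	}
	printf("%s\n", buf);	/* prints: convert=raid1,soft */
	return 0;
}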
+ */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) + if (num_devices >= btrfs_raid_array[i].devs_min) + allowed |= btrfs_raid_array[i].bg_flag; + + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { + ret = -EINVAL; + goto out; + } + + /* + * Allow to reduce metadata or system integrity only if force set for + * profiles with redundancy (copies, parity) + */ + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { + if (btrfs_raid_array[i].ncopies >= 2 || + btrfs_raid_array[i].tolerated_failures >= 1) + allowed |= btrfs_raid_array[i].bg_flag; + } + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) + reducing_redundancy = true; + else + reducing_redundancy = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + if (reducing_redundancy) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata redundancy"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata redundancy, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < + btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { + btrfs_warn(fs_info, + "balance: metadata profile %s has lower redundancy than data profile %s", + btrfs_bg_type_to_raid_name(meta_target), + btrfs_bg_type_to_raid_name(data_target)); + } + + ret = insert_balance_item(fs_info, bctl); + if (ret && ret != -EEXIST) + goto out; + + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } + + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + describe_balance_start_or_resume(fs_info); + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { + btrfs_info(fs_info, "balance: paused"); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + paused = true; + } + /* + * Balance can be canceled by: + * + * - Regular cancel request + * Then ret == -ECANCELED and balance_cancel_req > 0 + * + * - Fatal signal to "btrfs" process + * Either the signal caught by wait_reserve_ticket() and callers + * got -EINTR, or caught by btrfs_should_cancel_balance() and + * got -ECANCELED. + * Either way, in this case balance_cancel_req = 0, and + * ret == -EINTR or ret == -ECANCELED. 
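/*
 * A hedged sketch of how btrfs_balance() above derives the set of allowed
 * conversion targets from the number of rw devices: each profile whose
 * minimum device count is met contributes its bit to the mask. The table
 * below (names, devs_min values, bit positions) is an invented miniature of
 * btrfs_raid_array, not the kernel's real values.
 */
#include <stdint.h>
#include <stdio.h>

struct raid_attr {
	const char *name;
	int devs_min;
	uint64_t bg_flag;
};

static const struct raid_attr raid_array[] = {
	{ "single", 1, 1ULL << 0 },
	{ "dup",    1, 1ULL << 1 },
	{ "raid0",  2, 1ULL << 2 },
	{ "raid1",  2, 1ULL << 3 },
	{ "raid10", 4, 1ULL << 4 },
};

static uint64_t allowed_profiles(uint64_t num_devices)
{
	uint64_t allowed = 0;
	size_t i;

	for (i = 0; i < sizeof(raid_array) / sizeof(raid_array[0]); i++)
		if (num_devices >= (uint64_t)raid_array[i].devs_min)
			allowed |= raid_array[i].bg_flag;
	return allowed;
}

int main(void)
{
	printf("1 device:  0x%llx\n", (unsigned long long)allowed_profiles(1));	/* 0x3 */
	printf("3 devices: 0x%llx\n", (unsigned long long)allowed_profiles(3));	/* 0xf */
	return 0;
}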
+ * + * So here we only check the return value to catch canceled balance. + */ + else if (ret == -ECANCELED || ret == -EINTR) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + btrfs_update_ioctl_balance_args(fs_info, bargs); + } + + /* We didn't pause, we can clean everything up. */ + if (!paused) { + reset_balance_state(fs_info); + btrfs_exclop_finish(fs_info); + } + + wake_up(&fs_info->balance_wait_q); + + return ret; +out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + reset_balance_state(fs_info); + else + kfree(bctl); + btrfs_exclop_finish(fs_info); + + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + int ret = 0; + + sb_start_write(fs_info->sb); + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) + ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); + mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); + + return ret; +} + +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *tsk; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return 0; + } + mutex_unlock(&fs_info->balance_mutex); + + if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { + btrfs_info(fs_info, "balance: resume skipped"); + return 0; + } + + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + /* + * A ro->rw remount sequence should continue with the paused balance + * regardless of who pauses it, system or the user as of now, so set + * the resume flag. + */ + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); + return PTR_ERR_OR_ZERO(tsk); +} + +int btrfs_recover_balance(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_TEMPORARY_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->flags = btrfs_balance_flags(leaf, item); + bctl->flags |= BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + /* + * This should never happen, as the paused balance state is recovered + * during mount without any chance of other exclusive ops to collide. 
+ * + * This gives the exclusive op status to balance and keeps in paused + * state until user intervention (cancel or umount). If the ownership + * cannot be assigned, show a message but do not fail. The balance + * is in a paused state and must have fs_info::balance_ctl properly + * set up. + */ + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) + btrfs_warn(fs_info, + "balance: cannot set exclusive op status, resume manually"); + + btrfs_release_path(path); + + mutex_lock(&fs_info->balance_mutex); + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + /* + * A paused balance with the item stored on disk can be resumed at + * mount time if the mount is read-write. Otherwise it's still paused + * and we must not allow cancelling as it deletes the item. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_unlock(&fs_info->balance_mutex); + return -EROFS; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + mutex_lock(&fs_info->balance_mutex); + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to continue, we'll + * reexamine the status again. 
+ */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) { + reset_balance_state(fs_info); + btrfs_exclop_finish(fs_info); + btrfs_info(fs_info, "balance: canceled"); + } + } + + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + 
if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. + * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_offset; + int ret; + int slot; + int failed = 0; + bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = btrfs_device_get_total_bytes(device); + u64 diff; + u64 start; + + new_size = round_down(new_size, fs_info->sectorsize); + start = new_size; + diff = round_down(old_size - new_size, fs_info->sectorsize); + + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = READA_BACK; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + mutex_lock(&fs_info->chunk_mutex); + + btrfs_device_set_total_bytes(device, new_size); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + device->fs_devices->total_rw_bytes -= diff; + atomic64_sub(diff, &fs_info->free_chunk_space); + } + + /* + * Once the device's size has been set to the new size, ensure all + * in-memory chunks are synced to disk so that the loop below sees them + * and relocates them accordingly. 
+ */ + if (contains_pending_extent(device, &start, diff)) { + mutex_unlock(&fs_info->chunk_mutex); + ret = btrfs_commit_transaction(trans); + if (ret) + goto done; + } else { + mutex_unlock(&fs_info->chunk_mutex); + btrfs_end_transaction(trans); + } + +again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + do { + mutex_lock(&fs_info->reclaim_bgs_lock); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto done; + } + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + if (ret < 0) + goto done; + ret = 0; + btrfs_release_path(path); + break; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_release_path(path); + break; + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_release_path(path); + break; + } + + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(path); + + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); + if (ret < 0) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto done; + } + + ret = btrfs_relocate_chunk(fs_info, chunk_offset); + mutex_unlock(&fs_info->reclaim_bgs_lock); + if (ret == -ENOSPC) { + failed++; + } else if (ret) { + if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "could not shrink block group %llu due to active swapfile", + chunk_offset); + } + goto done; + } + } while (key.offset-- > 0); + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + goto done; + } + + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + + mutex_lock(&fs_info->chunk_mutex); + /* Clear all state bits beyond the shrunk device size */ + clear_extent_bits(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK); + + btrfs_device_set_disk_total_bytes(device, new_size); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total - diff, fs_info->sectorsize)); + mutex_unlock(&fs_info->chunk_mutex); + + btrfs_reserve_chunk_metadata(trans, false); + /* Now btrfs_update_device() will change the on-disk size. 
*/ + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } else { + ret = btrfs_commit_transaction(trans); + } +done: + btrfs_free_path(path); + if (ret) { + mutex_lock(&fs_info->chunk_mutex); + btrfs_device_set_total_bytes(device, old_size); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + device->fs_devices->total_rw_bytes += diff; + atomic64_add(diff, &fs_info->free_chunk_space); + mutex_unlock(&fs_info->chunk_mutex); + } + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + lockdep_assert_held(&fs_info->chunk_mutex); + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + + return 0; +} + +/* + * sort the devices in descending order by max_avail, total_avail + */ +static int btrfs_cmp_device_info(const void *a, const void *b) +{ + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) + return 1; + if (di_a->total_avail > di_b->total_avail) + return -1; + if (di_a->total_avail < di_b->total_avail) + return 1; + return 0; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + return; + + btrfs_set_fs_incompat(info, RAID56); +} + +static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) + return; + + btrfs_set_fs_incompat(info, RAID1C34); +} + +/* + * Structure used internally for btrfs_create_chunk() function. + * Wraps needed parameters. 
+ */ +struct alloc_chunk_ctl { + u64 start; + u64 type; + /* Total number of stripes to allocate */ + int num_stripes; + /* sub_stripes info for map */ + int sub_stripes; + /* Stripes per device */ + int dev_stripes; + /* Maximum number of devices to use */ + int devs_max; + /* Minimum number of devices to use */ + int devs_min; + /* ndevs has to be a multiple of this */ + int devs_increment; + /* Number of copies */ + int ncopies; + /* Number of stripes worth of bytes to store parity information */ + int nparity; + u64 max_stripe_size; + u64 max_chunk_size; + u64 dev_extent_min; + u64 stripe_size; + u64 chunk_size; + int ndevs; +}; + +static void init_alloc_chunk_ctl_policy_regular( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); + ASSERT(space_info); + + ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); + ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); + + if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) + ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + + /* We don't want a chunk larger than 10% of writable space */ + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), + ctl->max_chunk_size); + ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); +} + +static void init_alloc_chunk_ctl_policy_zoned( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 zone_size = fs_devices->fs_info->zone_size; + u64 limit; + int min_num_stripes = ctl->devs_min * ctl->dev_stripes; + int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; + u64 min_chunk_size = min_data_stripes * zone_size; + u64 type = ctl->type; + + ctl->max_stripe_size = zone_size; + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, + zone_size); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } else { + BUG(); + } + + /* We don't want a chunk larger than 10% of writable space */ + limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), + zone_size), + min_chunk_size); + ctl->max_chunk_size = min(limit, ctl->max_chunk_size); + ctl->dev_extent_min = zone_size * ctl->dev_stripes; +} + +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + int index = btrfs_bg_flags_to_raid_index(ctl->type); + + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; + ctl->devs_max = btrfs_raid_array[index].devs_max; + if (!ctl->devs_max) + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); + ctl->devs_min = btrfs_raid_array[index].devs_min; + ctl->devs_increment = btrfs_raid_array[index].devs_increment; + ctl->ncopies = btrfs_raid_array[index].ncopies; + ctl->nparity = btrfs_raid_array[index].nparity; + ctl->ndevs = 0; + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); + break; + case BTRFS_CHUNK_ALLOC_ZONED: + init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); + break; + default: + BUG(); + } +} + +static int gather_device_info(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + 
struct btrfs_fs_info *info = fs_devices->fs_info; + struct btrfs_device *device; + u64 total_avail; + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; + int ret; + int ndevs = 0; + u64 max_avail; + u64 dev_offset; + + /* + * in the first pass through the devices list, we gather information + * about the available holes on each device. + */ + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + WARN(1, KERN_ERR + "BTRFS: read-only device in alloc_list\n"); + continue; + } + + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + continue; + + if (device->total_bytes > device->bytes_used) + total_avail = device->total_bytes - device->bytes_used; + else + total_avail = 0; + + /* If there is no space on this device, skip it. */ + if (total_avail < ctl->dev_extent_min) + continue; + + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, + &max_avail); + if (ret && ret != -ENOSPC) + return ret; + + if (ret == 0) + max_avail = dev_extent_want; + + if (max_avail < ctl->dev_extent_min) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, + "%s: devid %llu has no free space, have=%llu want=%llu", + __func__, device->devid, max_avail, + ctl->dev_extent_min); + continue; + } + + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); + break; + } + devices_info[ndevs].dev_offset = dev_offset; + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + ctl->ndevs = ndevs; + + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + + return 0; +} + +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * The primary goal is to maximize the number of stripes, so use as + * many devices as possible, even if the stripes are not maximum sized. + * + * The DUP profile stores more than one stripe per device, the + * max_avail is the total size so we have to adjust. + */ + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + + /* This will have to be fixed for RAID1 and RAID10 over more drives */ + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* + * Use the number of data stripes to figure out how big this chunk is + * really going to be in terms of logical address space, and compare + * that answer with the max chunk size. If it's higher, we try to + * reduce stripe_size. + */ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + /* + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. + */ + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, + data_stripes), SZ_16M), + ctl->stripe_size); + } + + /* Stripe size should not go beyond 1G. 
*/
+	ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
+	/* Align to BTRFS_STRIPE_LEN */
+	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
+	ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+	return 0;
+}
+
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+				    struct btrfs_device_info *devices_info)
+{
+	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+	/* Number of stripes that count for block group size */
+	int data_stripes;
+
+	/*
+	 * It should hold because:
+	 *   dev_extent_min == dev_extent_want == zone_size * dev_stripes
+	 */
+	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+	ctl->stripe_size = zone_size;
+	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+	/* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
+	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+					     ctl->stripe_size) + ctl->nparity,
+				     ctl->dev_stripes);
+		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+	}
+
+	ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+	return 0;
+}
+
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
+			      struct alloc_chunk_ctl *ctl,
+			      struct btrfs_device_info *devices_info)
+{
+	struct btrfs_fs_info *info = fs_devices->fs_info;
+
+	/*
+	 * Round down to number of usable stripes, devs_increment can be any
+	 * number so we can't use round_down() that requires power of 2, while
+	 * rounddown is safe.
+	 */
+	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
+
+	if (ctl->ndevs < ctl->devs_min) {
+		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+			btrfs_debug(info,
+	"%s: not enough devices with free space: have=%d minimum required=%d",
+				    __func__, ctl->ndevs, ctl->devs_min);
+		}
+		return -ENOSPC;
+	}
+
+	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
+
+	switch (fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		return decide_stripe_size_regular(ctl, devices_info);
+	case BTRFS_CHUNK_ALLOC_ZONED:
+		return decide_stripe_size_zoned(ctl, devices_info);
+	default:
+		BUG();
+	}
+}
+
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
+			struct alloc_chunk_ctl *ctl,
+			struct btrfs_device_info *devices_info)
+{
+	struct btrfs_fs_info *info = trans->fs_info;
+	struct map_lookup *map = NULL;
+	struct extent_map_tree *em_tree;
+	struct btrfs_block_group *block_group;
+	struct extent_map *em;
+	u64 start = ctl->start;
+	u64 type = ctl->type;
+	int ret;
+	int i;
+	int j;
+
+	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+	map->num_stripes = ctl->num_stripes;
+
+	for (i = 0; i < ctl->ndevs; ++i) {
+		for (j = 0; j < ctl->dev_stripes; ++j) {
+			int s = i * ctl->dev_stripes + j;
+			map->stripes[s].dev = devices_info[i].dev;
+			map->stripes[s].physical = devices_info[i].dev_offset +
+						   j * ctl->stripe_size;
+		}
+	}
+	map->io_align = BTRFS_STRIPE_LEN;
+	map->io_width = BTRFS_STRIPE_LEN;
+	map->type = type;
+	map->sub_stripes = ctl->sub_stripes;
+
+	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
+
+	em = alloc_extent_map();
+	if (!em) {
+		kfree(map);
+		return ERR_PTR(-ENOMEM);
+	}
+	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+	em->map_lookup = map;
+	em->start = start;
+	em->len = ctl->chunk_size;
+	em->block_start = 0;
+ em->block_len = em->len; + em->orig_block_len = ctl->stripe_size; + + em_tree = &info->mapping_tree; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + if (ret) { + write_unlock(&em_tree->lock); + free_extent_map(em); + return ERR_PTR(ret); + } + write_unlock(&em_tree->lock); + + block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + if (IS_ERR(block_group)) + goto error_del_extent; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = map->stripes[i].dev; + + btrfs_device_set_bytes_used(dev, + dev->bytes_used + ctl->stripe_size); + if (list_empty(&dev->post_commit_list)) + list_add_tail(&dev->post_commit_list, + &trans->transaction->dev_update_list); + } + + atomic64_sub(ctl->stripe_size * map->num_stripes, + &info->free_chunk_space); + + free_extent_map(em); + check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); + + return block_group; + +error_del_extent: + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); + + return block_group; +} + +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, + u64 type) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; + struct btrfs_block_group *block_group; + int ret; + + lockdep_assert_held(&info->chunk_mutex); + + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); + return ERR_PTR(-EINVAL); + } + + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); + return ERR_PTR(-ENOSPC); + } + + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(info, "invalid chunk type 0x%llx requested", type); + ASSERT(0); + return ERR_PTR(-EINVAL); + } + + ctl.start = find_next_chunk(info); + ctl.type = type; + init_alloc_chunk_ctl(fs_devices, &ctl); + + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return ERR_PTR(-ENOMEM); + + ret = gather_device_info(fs_devices, &ctl, devices_info); + if (ret < 0) { + block_group = ERR_PTR(ret); + goto out; + } + + ret = decide_stripe_size(fs_devices, &ctl, devices_info); + if (ret < 0) { + block_group = ERR_PTR(ret); + goto out; + } + + block_group = create_chunk(trans, &ctl, devices_info); + +out: + kfree(devices_info); + return block_group; +} + +/* + * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the + * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system + * chunks. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. + */ +int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + int i; + int ret; + + /* + * We take the chunk_mutex for 2 reasons: + * + * 1) Updates and insertions in the chunk btree must be done while holding + * the chunk_mutex, as well as updating the system chunk array in the + * superblock. 
See the comment on top of btrfs_chunk_alloc() for the + * details; + * + * 2) To prevent races with the final phase of a device replace operation + * that replaces the device object associated with the map's stripes, + * because the device object's id can change at any time during that + * final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * which would cause a failure when updating the device item, which does + * not exists, or persisting a stripe of the chunk item with such ID. + * Here we can't use the device_list_mutex because our caller already + * has locked the chunk_mutex, and the final phase of device replace + * acquires both mutexes - first the device_list_mutex and then the + * chunk_mutex. Using any of those two mutexes protects us from a + * concurrent device replace. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + + em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + btrfs_abort_transaction(trans, ret); + return ret; + } + + map = em->map_lookup; + item_size = btrfs_chunk_item_size(map->num_stripes); + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + + ret = btrfs_update_device(trans, device); + if (ret) + goto out; + } + + stripe = &chunk->stripe; + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + const u64 dev_offset = map->stripes[i].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + } + + btrfs_set_stack_chunk_length(chunk, bg->length); + btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); + btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = bg->start; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + if (ret) + goto out; + + set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); + if (ret) + goto out; + } + +out: + kfree(chunk); + free_extent_map(em); + return ret; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 alloc_profile; + struct btrfs_block_group *meta_bg; + struct btrfs_block_group *sys_bg; + + /* + * When adding a new device for sprouting, the seed device is read-only + * so we must first allocate a metadata and a system chunk. 
But before + * adding the block group items to the extent, device and chunk btrees, + * we must first: + * + * 1) Create both chunks without doing any changes to the btrees, as + * otherwise we would get -ENOSPC since the block groups from the + * seed device are read-only; + * + * 2) Add the device item for the new sprout device - finishing the setup + * of a new block group requires updating the device item in the chunk + * btree, so it must exist when we attempt to do it. The previous step + * ensures this does not fail with -ENOSPC. + * + * After that we can add the block group items to their btrees: + * update existing device item in the chunk btree, add a new block group + * item to the extent btree, add a new chunk item to the chunk btree and + * finally add the new device extent items to the devices btree. + */ + + alloc_profile = btrfs_metadata_alloc_profile(fs_info); + meta_bg = btrfs_create_chunk(trans, alloc_profile); + if (IS_ERR(meta_bg)) + return PTR_ERR(meta_bg); + + alloc_profile = btrfs_system_alloc_profile(fs_info); + sys_bg = btrfs_create_chunk(trans, alloc_profile); + if (IS_ERR(sys_bg)) + return PTR_ERR(sys_bg); + + return 0; +} + +static inline int btrfs_chunk_max_errors(struct map_lookup *map) +{ + const int index = btrfs_bg_flags_to_raid_index(map->type); + + return btrfs_raid_array[index].tolerated_failures; +} + +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + int miss_ndevs = 0; + int i; + bool ret = true; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) + return false; + + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) { + if (test_bit(BTRFS_DEV_STATE_MISSING, + &map->stripes[i].dev->dev_state)) { + miss_ndevs++; + continue; + } + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &map->stripes[i].dev->dev_state)) { + ret = false; + goto end; + } + } + + /* + * If the number of missing devices is larger than max errors, we can + * not write the data into that chunk successfully. + */ + if (miss_ndevs > btrfs_chunk_max_errors(map)) + ret = false; +end: + free_extent_map(em); + return ret; +} + +void btrfs_mapping_tree_free(struct extent_map_tree *tree) +{ + struct extent_map *em; + + while (1) { + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, 0, (u64)-1); + if (em) + remove_extent_mapping(tree, em); + write_unlock(&tree->lock); + if (!em) + break; + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + enum btrfs_raid_types index; + int ret = 1; + + em = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + /* + * We could return errors for these cases, but that could get + * ugly and we'd probably do the same thing which is just not do + * anything else and exit, so return 1 so the callers don't try + * to use other copies. + */ + return 1; + + map = em->map_lookup; + index = btrfs_bg_flags_to_raid_index(map->type); + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + ret = btrfs_raid_array[index].ncopies; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + /* + * There could be two corrupted data stripes, we need + * to loop retry in order to rebuild the correct data. 
+ * + * Fail a stripe at a time on every retry except the + * stripe under reconstruction. + */ + ret = map->num_stripes; + free_extent_map(em); + return ret; +} + +unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, + u64 logical) +{ + struct extent_map *em; + struct map_lookup *map; + unsigned long len = fs_info->sectorsize; + + if (!btrfs_fs_incompat(fs_info, RAID56)) + return len; + + em = btrfs_get_chunk_map(fs_info, logical, len); + + if (!WARN_ON(IS_ERR(em))) { + map = em->map_lookup; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); + free_extent_map(em); + } + return len; +} + +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + int ret = 0; + + if (!btrfs_fs_incompat(fs_info, RAID56)) + return 0; + + em = btrfs_get_chunk_map(fs_info, logical, len); + + if(!WARN_ON(IS_ERR(em))) { + map = em->map_lookup; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + ret = 1; + free_extent_map(em); + } + return ret; +} + +static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct map_lookup *map, int first, + int dev_replace_is_ongoing) +{ + int i; + int num_stripes; + int preferred_mirror; + int tolerance; + struct btrfs_device *srcdev; + + ASSERT((map->type & + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = map->sub_stripes; + else + num_stripes = map->num_stripes; + + switch (fs_info->fs_devices->read_policy) { + default: + /* Shouldn't happen, just warn and use pid instead of failing */ + btrfs_warn_rl(fs_info, + "unknown read_policy type %u, reset to pid", + fs_info->fs_devices->read_policy); + fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + fallthrough; + case BTRFS_READ_POLICY_PID: + preferred_mirror = first + (current->pid % num_stripes); + break; + } + + if (dev_replace_is_ongoing && + fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) + srcdev = fs_info->dev_replace.srcdev; + else + srcdev = NULL; + + /* + * try to avoid the drive that is the source drive for a + * dev-replace procedure, only choose it if no other non-missing + * mirror is available + */ + for (tolerance = 0; tolerance < 2; tolerance++) { + if (map->stripes[preferred_mirror].dev->bdev && + (tolerance || map->stripes[preferred_mirror].dev != srcdev)) + return preferred_mirror; + for (i = first; i < first + num_stripes; i++) { + if (map->stripes[i].dev->bdev && + (tolerance || map->stripes[i].dev != srcdev)) + return i; + } + } + + /* we couldn't find one that doesn't fail. 
Just return something + * and the io error handling code will clean up eventually + */ + return preferred_mirror; +} + +static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u16 total_stripes) +{ + struct btrfs_io_context *bioc; + + bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ + sizeof(struct btrfs_io_stripe) * (total_stripes), + GFP_NOFS); + + if (!bioc) + return NULL; + + refcount_set(&bioc->refs, 1); + + bioc->fs_info = fs_info; + bioc->replace_stripe_src = -1; + bioc->full_stripe_logical = (u64)-1; + + return bioc; +} + +void btrfs_get_bioc(struct btrfs_io_context *bioc) +{ + WARN_ON(!refcount_read(&bioc->refs)); + refcount_inc(&bioc->refs); +} + +void btrfs_put_bioc(struct btrfs_io_context *bioc) +{ + if (!bioc) + return; + if (refcount_dec_and_test(&bioc->refs)) + kfree(bioc); +} + +/* + * Please note that, discard won't be sent to target device of device + * replace. + */ +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_discard_stripe *stripes; + u64 length = *length_ret; + u64 offset; + u32 stripe_nr; + u32 stripe_nr_end; + u32 stripe_cnt; + u64 stripe_end_offset; + u64 stripe_offset; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; + u32 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret; + int i; + + em = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return ERR_CAST(em); + + map = em->map_lookup; + + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out_free_map; + } + + offset = logical - em->start; + length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; + + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); + + stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> + BTRFS_STRIPE_LEN_SHIFT; + stripe_cnt = stripe_nr_end - stripe_nr; + stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + *num_stripes = 1; + stripe_index = 0; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + *num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); + stripe_index = stripe_nr % factor; + stripe_nr /= factor; + stripe_index *= sub_stripes; + + remaining_stripes = stripe_cnt % factor; + stripes_per_dev = stripe_cnt / factor; + last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_DUP)) { + *num_stripes = map->num_stripes; + } else { + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; + } + + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); + if (!stripes) { + ret = -ENOMEM; + goto out_free_map; + } + + for (i = 0; i < 
*num_stripes; i++) { + stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); + + if (i / sub_stripes < remaining_stripes) + stripes[i].length += BTRFS_STRIPE_LEN; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + stripes[i].length -= stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + stripes[i].length -= stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else { + stripes[i].length = length; + } + + stripe_index++; + if (stripe_index == map->num_stripes) { + stripe_index = 0; + stripe_nr++; + } + } + + free_extent_map(em); + return stripes; +out_free_map: + free_extent_map(em); + return ERR_PTR(ret); +} + +static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + bool ret; + + /* Non zoned filesystem does not use "to_copy" flag */ + if (!btrfs_is_zoned(fs_info)) + return false; + + cache = btrfs_lookup_block_group(fs_info, logical); + + ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); + + btrfs_put_block_group(cache); + return ret; +} + +static void handle_ops_on_dev_replace(enum btrfs_map_op op, + struct btrfs_io_context *bioc, + struct btrfs_dev_replace *dev_replace, + u64 logical, + int *num_stripes_ret, int *max_errors_ret) +{ + u64 srcdev_devid = dev_replace->srcdev->devid; + /* + * At this stage, num_stripes is still the real number of stripes, + * excluding the duplicated stripes. + */ + int num_stripes = *num_stripes_ret; + int nr_extra_stripes = 0; + int max_errors = *max_errors_ret; + int i; + + /* + * A block group which has "to_copy" set will eventually be copied by + * the dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; + + /* + * Duplicate the write operations while the dev-replace procedure is + * running. Since the copying of the old disk to the new disk takes + * place at run time while the filesystem is mounted writable, the + * regular write operations to the old disk have to be duplicated to go + * to the new disk as well. + * + * Note that device->missing is handled by the caller, and that the + * write to the old disk is already set up in the stripes array. + */ + for (i = 0; i < num_stripes; i++) { + struct btrfs_io_stripe *old = &bioc->stripes[i]; + struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; + + if (old->dev->devid != srcdev_devid) + continue; + + new->physical = old->physical; + new->dev = dev_replace->tgtdev; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + bioc->replace_stripe_src = i; + nr_extra_stripes++; + } + + /* We can only have at most 2 extra nr_stripes (for DUP). */ + ASSERT(nr_extra_stripes <= 2); + /* + * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for + * replace. + * If we have 2 extra stripes, only choose the one with smaller physical. + */ + if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { + struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; + struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; + + /* Only DUP can have two extra stripes. 
*/ + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + + /* + * Swap the last stripe stripes and reduce @nr_extra_stripes. + * The extra stripe would still be there, but won't be accessed. + */ + if (first->physical > second->physical) { + swap(second->physical, first->physical); + swap(second->dev, first->dev); + nr_extra_stripes--; + } + } + + *num_stripes_ret = num_stripes + nr_extra_stripes; + *max_errors_ret = max_errors + nr_extra_stripes; + bioc->replace_nr_stripes = nr_extra_stripes; +} + +static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + u64 offset, u32 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) +{ + /* + * Stripe_nr is the stripe where this block falls. stripe_offset is + * the offset of this block in its stripe. + */ + *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(*stripe_offset < U32_MAX); + + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = + btrfs_stripe_nr_to_offset(nr_data_stripes(map)); + + /* + * For full stripe start, we use previously calculated + * @stripe_nr. Align it to nr_data_stripes, then multiply with + * STRIPE_LEN. + * + * By this we can avoid u64 division completely. And we have + * to go rounddown(), not round_down(), as nr_data_stripes is + * not ensured to be power of 2. + */ + *full_stripe_start = + btrfs_stripe_nr_to_offset( + rounddown(*stripe_nr, nr_data_stripes(map))); + + ASSERT(*full_stripe_start + full_stripe_len > offset); + ASSERT(*full_stripe_start <= offset); + /* + * For writes to RAID56, allow to write a full stripe set, but + * no straddling of stripe sets. + */ + if (op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - *full_stripe_start); + } + + /* + * For other RAID types and for RAID56 reads, allow a single stripe (on + * a single disk). + */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) + return BTRFS_STRIPE_LEN - *stripe_offset; + return U64_MAX; +} + +static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, + u32 stripe_index, u64 stripe_offset, u32 stripe_nr) +{ + dst->dev = map->stripes[stripe_index].dev; + dst->physical = map->stripes[stripe_index].physical + + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); +} + +/* + * Map one logical range to one or more physical ranges. + * + * @length: (Mandatory) mapped length of this run. + * One logical range can be split into different segments + * due to factors like zones and RAID0/5/6/10 stripe + * boundaries. + * + * @bioc_ret: (Mandatory) returned btrfs_io_context structure. + * which has one or more physical ranges (btrfs_io_stripe) + * recorded inside. + * Caller should call btrfs_put_bioc() to free it after use. + * + * @smap: (Optional) single physical range optimization. + * If the map request can be fulfilled by one single + * physical range, and this is parameter is not NULL, + * then @bioc_ret would be NULL, and @smap would be + * updated. + * + * @mirror_num_ret: (Mandatory) returned mirror number if the original + * value is 0. + * + * Mirror number 0 means to choose any live mirrors. + * + * For non-RAID56 profiles, non-zero mirror_num means + * the Nth mirror. (e.g. mirror_num 1 means the first + * copy). + * + * For RAID56 profile, mirror 1 means rebuild from P and + * the remaining data stripes. + * + * For RAID6 profile, mirror > 2 means mark another + * data/P stripe error and rebuild from the remaining + * stripes.. 
+ * + * @need_raid_map: (Used only for integrity checker) whether the map wants + * a full stripe map (including all data and P/Q stripes) + * for RAID56. Should always be 1 except integrity checker. + */ +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) +{ + struct extent_map *em; + struct map_lookup *map; + u64 map_offset; + u64 stripe_offset; + u32 stripe_nr; + u32 stripe_index; + int data_stripes; + int i; + int ret = 0; + int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); + int num_stripes; + int num_copies; + int max_errors = 0; + struct btrfs_io_context *bioc = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + u16 num_alloc_stripes; + u64 raid56_full_stripe_start = (u64)-1; + u64 max_len; + + ASSERT(bioc_ret); + + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); + if (mirror_num > num_copies) + return -EINVAL; + + em = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + data_stripes = nr_data_stripes(map); + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, + &stripe_offset, &raid56_full_stripe_start); + *length = min_t(u64, em->len - map_offset, max_len); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + /* + * Hold the semaphore for read during the whole operation, write is + * requested at commit time but must wait. + */ + if (!dev_replace_is_ongoing) + up_read(&dev_replace->rwsem); + + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; + if (op == BTRFS_MAP_READ) + mirror_num = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { + if (op != BTRFS_MAP_READ) { + num_stripes = map->num_stripes; + } else if (mirror_num) { + stripe_index = mirror_num - 1; + } else { + stripe_index = find_live_mirror(fs_info, map, 0, + dev_replace_is_ongoing); + mirror_num = stripe_index + 1; + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (op != BTRFS_MAP_READ) { + num_stripes = map->num_stripes; + } else if (mirror_num) { + stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u32 factor = map->num_stripes / map->sub_stripes; + + stripe_index = (stripe_nr % factor) * map->sub_stripes; + stripe_nr /= factor; + + if (op != BTRFS_MAP_READ) + num_stripes = map->sub_stripes; + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + int old_stripe_index = stripe_index; + stripe_index = find_live_mirror(fs_info, map, + stripe_index, + dev_replace_is_ongoing); + mirror_num = stripe_index - old_stripe_index + 1; + } + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { + /* + * Push stripe_nr back to the start of the full stripe + * For those cases needing a full stripe, @stripe_nr + * is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, + * but that can be expensive. Here we just divide + * @stripe_nr with @data_stripes. + */ + stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. 
Return all stripes */ + num_stripes = map->num_stripes; + max_errors = btrfs_chunk_max_errors(map); + + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; + stripe_index = 0; + stripe_offset = 0; + } else { + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + stripe_index = stripe_nr % data_stripes; + stripe_nr /= data_stripes; + if (mirror_num > 1) + stripe_index = data_stripes + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + stripe_index = (stripe_nr + stripe_index) % map->num_stripes; + if (op == BTRFS_MAP_READ && mirror_num <= 1) + mirror_num = 1; + } + } else { + /* + * After this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; + mirror_num = stripe_index + 1; + } + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, + "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } + + num_alloc_stripes = num_stripes; + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + op != BTRFS_MAP_READ) + /* + * For replace case, we need to add extra stripes for extra + * duplicated stripes. + * + * For both WRITE and GET_READ_MIRRORS, we may have at most + * 2 more stripes (DUP types, otherwise 1). + */ + num_alloc_stripes += 2; + + /* + * If this I/O maps to a single device, try to return the device and + * physical block information on the stack instead of allocating an + * I/O context structure. + */ + if (smap && num_alloc_stripes == 1 && + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { + set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + if (mirror_num_ret) + *mirror_num_ret = mirror_num; + *bioc_ret = NULL; + ret = 0; + goto out; + } + + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + if (!bioc) { + ret = -ENOMEM; + goto out; + } + bioc->map_type = map->type; + + /* + * For RAID56 full map, we need to make sure the stripes[] follows the + * rule that data stripes are all ordered, then followed with P and Q + * (if we have). + * + * It's still mostly the same as other profiles, just with extra rotation. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (op != BTRFS_MAP_READ || mirror_num > 1)) { + /* + * For RAID56 @stripe_nr is already the number of full stripes + * before us, which is also the rotation value (needs to modulo + * with num_stripes). + * + * In this case, we just add @stripe_nr with @i, then do the + * modulo, to reduce one modulo call. + */ + bioc->full_stripe_logical = em->start + + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); + for (i = 0; i < num_stripes; i++) + set_io_stripe(&bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + } else { + /* + * For all other non-RAID56 profiles, just copy the target + * stripe into the bioc. 
+ */ + for (i = 0; i < num_stripes; i++) { + set_io_stripe(&bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + stripe_index++; + } + } + + if (op != BTRFS_MAP_READ) + max_errors = btrfs_chunk_max_errors(map); + + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + op != BTRFS_MAP_READ) { + handle_ops_on_dev_replace(op, bioc, dev_replace, logical, + &num_stripes, &max_errors); + } + + *bioc_ret = bioc; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; + +out: + if (dev_replace_is_ongoing) { + lockdep_assert_held(&dev_replace->rwsem); + /* Unlock and let waiting writers proceed */ + up_read(&dev_replace->rwsem); + } + free_extent_map(em); + return ret; +} + +static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, + const struct btrfs_fs_devices *fs_devices) +{ + if (args->fsid == NULL) + return true; + if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) + return true; + return false; +} + +static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, + const struct btrfs_device *device) +{ + if (args->missing) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; + } + + if (device->devid != args->devid) + return false; + if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) + return false; + return true; +} + +/* + * Find a device specified by @devid or @uuid in the list of @fs_devices, or + * return NULL. + * + * If devid and uuid are both specified, the match must be exact, otherwise + * only devid is used. + */ +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; + + if (dev_args_match_fs_devices(args, fs_devices)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (dev_args_match_device(args, device)) + return device; + } + } + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + if (!dev_args_match_fs_devices(args, seed_devs)) + continue; + list_for_each_entry(device, &seed_devs->devices, dev_list) { + if (dev_args_match_device(args, device)) + return device; + } + } + + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + unsigned int nofs_flag; + + /* + * We call this under the chunk_mutex, so we want to use NOFS for this + * allocation, however we don't want to change btrfs_alloc_device() to + * always do NOFS because we use it in a lot of other GFP_KERNEL safe + * places. + */ + + nofs_flag = memalloc_nofs_save(); + device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(device)) + return device; + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + fs_devices->missing_devices++; + + return device; +} + +/* + * Allocate new device struct, set up devid and UUID. + * + * @fs_info: used only for generating a new devid, can be NULL if + * devid is provided (i.e. @devid != NULL). + * @devid: a pointer to devid for this device. If NULL a new devid + * is generated. + * @uuid: a pointer to UUID for this device. If NULL a new UUID + * is generated. 
+ * @path: a pointer to device path if available, NULL otherwise. + * + * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() + * on error. Returned struct is not linked onto any lists and must be + * destroyed with btrfs_free_device. + */ +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, const u8 *uuid, + const char *path) +{ + struct btrfs_device *dev; + u64 tmp; + + if (WARN_ON(!devid && !fs_info)) + return ERR_PTR(-EINVAL); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->post_commit_list); + + atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); + extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); + + if (devid) + tmp = *devid; + else { + int ret; + + ret = find_next_devid(fs_info, &tmp); + if (ret) { + btrfs_free_device(dev); + return ERR_PTR(ret); + } + } + dev->devid = tmp; + + if (uuid) + memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); + else + generate_random_uuid(dev->uuid); + + if (path) { + struct rcu_string *name; + + name = rcu_string_strdup(path, GFP_KERNEL); + if (!name) { + btrfs_free_device(dev); + return ERR_PTR(-ENOMEM); + } + rcu_assign_pointer(dev->name, name); + } + + return dev; +} + +static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid, bool error) +{ + if (error) + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); + else + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); +} + +u64 btrfs_calc_stripe_length(const struct extent_map *em) +{ + const struct map_lookup *map = em->map_lookup; + const int data_stripes = calc_data_stripes(map->type, map->num_stripes); + + return div_u64(em->len, data_stripes); +} + +#if BITS_PER_LONG == 32 +/* + * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE + * can't be accessed on 32bit systems. + * + * This function do mount time check to reject the fs if it already has + * metadata chunk beyond that limit. + */ +static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return 0; + + if (logical + length < MAX_LFS_FILESIZE) + return 0; + + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; +} + +/* + * This is to give early warning for any metadata chunk reaching + * BTRFS_32BIT_EARLY_WARN_THRESHOLD. + * Although we can still access the metadata, it's not going to be possible + * once the limit is reached. 
+ */ +static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return; + + if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) + return; + + btrfs_warn_32bit_limit(fs_info); +} +#endif + +static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, uuid, true); + return ERR_PTR(-ENOENT); + } + + dev = add_missing_dev(fs_info->fs_devices, devid, uuid); + if (IS_ERR(dev)) { + btrfs_err(fs_info, "failed to init missing device %llu: %ld", + devid, PTR_ERR(dev)); + return dev; + } + btrfs_report_missing_device(fs_info, devid, uuid, false); + + return dev; +} + +static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u64 type; + u8 uuid[BTRFS_UUID_SIZE]; + int index; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + index = btrfs_bg_flags_to_raid_index(type); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + +#if BITS_PER_LONG == 32 + ret = check_32bit_meta_chunk(fs_info, logical, length, type); + if (ret < 0) + return ret; + warn_32bit_meta_chunk(fs_info, logical, length, type); +#endif + + /* + * Only need to verify chunk item if we're reading from sys chunk array, + * as chunk item in tree block is already verified by tree-checker. + */ + if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { + ret = btrfs_check_chunk_valid(leaf, chunk, logical); + if (ret) + return ret; + } + + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, logical, 1); + read_unlock(&map_tree->lock); + + /* already mapped? 
*/ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + em->map_lookup = map; + em->start = logical; + em->len = length; + em->orig_start = 0; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->type = type; + /* + * We can't use the sub_stripes value, as for profiles other than + * RAID10, they may have 0 as sub_stripes for filesystems created by + * older mkfs (<2008). + */ + map->sub_stripes = btrfs_raid_array[index].sub_stripes; + map->verified_stripes = 0; + em->orig_block_len = btrfs_calc_stripe_length(em); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + args.devid = devid; + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + args.uuid = uuid; + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); + if (!map->stripes[i].dev) { + map->stripes[i].dev = handle_missing_device(fs_info, + devid, uuid); + if (IS_ERR(map->stripes[i].dev)) { + ret = PTR_ERR(map->stripes[i].dev); + free_extent_map(em); + return ret; + } + } + + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &(map->stripes[i].dev->dev_state)); + } + + write_lock(&map_tree->lock); + ret = add_extent_mapping(map_tree, em, 0); + write_unlock(&map_tree->lock); + if (ret < 0) { + btrfs_err(fs_info, + "failed to add chunk map, start=%llu len=%llu: %d", + em->start, em->len, ret); + } + free_extent_map(em); + + return ret; +} + +static void fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; + device->commit_total_bytes = device->disk_total_bytes; + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->commit_bytes_used = device->bytes_used; + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + + ptr = btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); +} + +static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, + u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + lockdep_assert_held(&uuid_mutex); + ASSERT(fsid); + + /* This will match only for multi-device seed fs */ + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) + if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) + return fs_devices; + + + fs_devices = find_fsid(fsid, NULL); + if (!fs_devices) { + if (!btrfs_test_opt(fs_info, DEGRADED)) + return ERR_PTR(-ENOENT); + + fs_devices = alloc_fs_devices(fsid, NULL); + if (IS_ERR(fs_devices)) + return fs_devices; + + fs_devices->seeding =
true; + fs_devices->opened = 1; + return fs_devices; + } + + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) + return fs_devices; + + ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); + if (ret) { + free_fs_devices(fs_devices); + return ERR_PTR(ret); + } + + if (!fs_devices->seeding) { + close_fs_devices(fs_devices); + free_fs_devices(fs_devices); + return ERR_PTR(-EINVAL); + } + + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); + + return fs_devices; +} + +static int read_one_dev(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_FSID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + args.devid = devid; + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), + BTRFS_FSID_SIZE); + args.uuid = dev_uuid; + args.fsid = fs_uuid; + + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { + fs_devices = open_seed_devices(fs_info, fs_uuid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); + } + + device = btrfs_find_device(fs_info->fs_devices, &args); + if (!device) { + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, + dev_uuid, true); + return -ENOENT; + } + + device = add_missing_dev(fs_devices, devid, dev_uuid); + if (IS_ERR(device)) { + btrfs_err(fs_info, + "failed to add missing dev %llu: %ld", + devid, PTR_ERR(device)); + return PTR_ERR(device); + } + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); + } else { + if (!device->bdev) { + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, + devid, dev_uuid, true); + return -ENOENT; + } + btrfs_report_missing_device(fs_info, devid, + dev_uuid, false); + } + + if (!device->bdev && + !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. 
+ * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + device->fs_devices->missing_devices++; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + } + + /* Move the device to its own fs_devices */ + if (device->fs_devices != fs_devices) { + ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, + &device->dev_state)); + + list_move(&device->dev_list, &fs_devices->devices); + device->fs_devices->num_devices--; + fs_devices->num_devices++; + + device->fs_devices->missing_devices--; + fs_devices->missing_devices++; + + device->fs_devices = fs_devices; + } + } + + if (device->fs_devices != fs_info->fs_devices) { + BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + if (device->bdev) { + u64 max_total_bytes = bdev_nr_bytes(device->bdev); + + if (device->total_bytes > max_total_bytes) { + btrfs_err(fs_info, + "device total_bytes should be at most %llu but found %llu", + max_total_bytes, device->total_bytes); + return -EINVAL; + } + } + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { + device->fs_devices->total_rw_bytes += device->total_bytes; + atomic64_add(device->total_bytes - device->bytes_used, + &fs_info->free_chunk_space); + } + ret = 0; + return ret; +} + +int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *super_copy = fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *array_ptr; + unsigned long sb_array_offset; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur_offset; + u64 type; + struct btrfs_key key; + + ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); + + /* + * We allocated a dummy extent, just to use extent buffer accessors. + * There will be unused space after BTRFS_SUPER_INFO_SIZE, but + * that's fine, we will not go beyond system chunk array anyway. 
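+ * The superblock copy is written into that dummy buffer right below, so the regular extent buffer accessors can walk sys_chunk_array.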
+ */ + sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); + if (!sb) + return -ENOMEM; + set_extent_buffer_uptodate(sb); + + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + array_ptr = super_copy->sys_chunk_array; + sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); + cur_offset = 0; + + while (cur_offset < array_size) { + disk_key = (struct btrfs_disk_key *)array_ptr; + len = sizeof(*disk_key); + if (cur_offset + len > array_size) + goto out_short_read; + + btrfs_disk_key_to_cpu(&key, disk_key); + + array_ptr += len; + sb_array_offset += len; + cur_offset += len; + + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + (u32)key.type, cur_offset); + ret = -EIO; + break; + } + + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be present, + * exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + btrfs_err(fs_info, + "invalid number of stripes %u in sys_array at offset %u", + num_stripes, cur_offset); + ret = -EIO; + break; + } + + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + + ret = read_one_chunk(&key, sb, chunk); + if (ret) + break; + + array_ptr += len; + sb_array_offset += len; + cur_offset += len; + } + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); + return ret; + +out_short_read: + btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", + len, cur_offset); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); + return -EIO; +} + +/* + * Check if all chunks in the fs are OK for read-write degraded mount + * + * If the @failing_dev is specified, it's accounted as missing. + * + * Return true if all chunks meet the minimal RW mount requirements. + * Return false if any chunk doesn't meet the minimal RW mount requirements. + */ +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev) +{ + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + u64 next_start = 0; + bool ret = true; + + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, 0, (u64)-1); + read_unlock(&map_tree->lock); + /* No chunk at all? 
Return false anyway */ + if (!em) { + ret = false; + goto out; + } + while (em) { + struct map_lookup *map; + int missing = 0; + int max_tolerated; + int i; + + map = em->map_lookup; + max_tolerated = + btrfs_get_num_tolerated_disk_barrier_failures( + map->type); + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = map->stripes[i].dev; + + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + dev->last_flush_error) + missing++; + else if (failing_dev && failing_dev == dev) + missing++; + } + if (missing > max_tolerated) { + if (!failing_dev) + btrfs_warn(fs_info, + "chunk %llu missing %d devices, max tolerance is %d for writable mount", + em->start, missing, max_tolerated); + free_extent_map(em); + ret = false; + goto out; + } + next_start = extent_map_end(em); + free_extent_map(em); + + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, next_start, + (u64)(-1) - next_start); + read_unlock(&map_tree->lock); + } +out: + return ret; +} + +static void readahead_tree_node_children(struct extent_buffer *node) +{ + int i; + const int nr_items = btrfs_header_nritems(node); + + for (i = 0; i < nr_items; i++) + btrfs_readahead_node_child(node, i); +} + +int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + int iter_ret = 0; + u64 total_dev = 0; + u64 last_ra_node = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * uuid_mutex is needed only if we are mounting a sprout FS + * otherwise we don't need it. + */ + mutex_lock(&uuid_mutex); + + /* + * It is possible for mount and umount to race in such a way that + * we execute this code path, but open_fs_devices failed to clear + * total_rw_bytes. We certainly want it cleared before reading the + * device items, so clear it here. + */ + fs_info->fs_devices->total_rw_bytes = 0; + + /* + * Lockdep complains about possible circular locking dependency between + * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores + * used for freeze procection of a fs (struct super_block.s_writers), + * which we take when starting a transaction, and extent buffers of the + * chunk tree if we call read_one_dev() while holding a lock on an + * extent buffer of the chunk tree. Since we are mounting the filesystem + * and at this point there can't be any concurrent task modifying the + * chunk tree, to keep it simple, just skip locking on the chunk tree. + */ + ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + path->skip_locking = 1; + + /* + * Read all device items, and then all the chunk items. All + * device items are found before any chunk item (their object id + * is smaller than the lowest possible object id for a chunk + * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
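+ * read_one_chunk() looks up each stripe's device via btrfs_find_device(), so the device items must already have been processed.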
+ */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *node = path->nodes[1]; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (node) { + if (last_ra_node != node->start) { + readahead_tree_node_children(node); + last_ra_node = node->start; + } + } + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(leaf, dev_item); + if (ret) + goto error; + total_dev++; + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + + /* + * We are only called at mount time, so no need to take + * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, + * we always lock first fs_info->chunk_mutex before + * acquiring any locks on the chunk tree. This is a + * requirement for chunk allocation, see the comment on + * top of btrfs_chunk_alloc() for details. + */ + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(&found_key, leaf, chunk); + if (ret) + goto error; + } + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto error; + } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. + */ + if (total_dev != fs_info->fs_devices->total_devices) { + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", + btrfs_super_num_devices(fs_info->super_copy), + total_dev); + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); + } + if (btrfs_super_total_bytes(fs_info->super_copy) < + fs_info->fs_devices->total_rw_bytes) { + btrfs_err(fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(fs_info->super_copy), + fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } + ret = 0; +error: + mutex_unlock(&uuid_mutex); + + btrfs_free_path(path); + return ret; +} + +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + struct btrfs_device *device; + int ret = 0; + + fs_devices->fs_info = fs_info; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + device->fs_info = fs_info; + ret = btrfs_get_dev_zone_info(device, false); + if (ret) + break; + } + + seed_devs->fs_info = fs_info; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, + int index) +{ + u64 val; + + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} + +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) +{ + struct 
btrfs_dev_stats_item *ptr; + struct extent_buffer *eb; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + if (!device->fs_info->dev_root) + return 0; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return ret < 0 ? ret : 0; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + + return 0; +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + } +out: + mutex_unlock(&fs_devices->device_list_mutex); + + btrfs_free_path(path); + return ret; +} + +static int update_dev_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_stats_item *ptr; + int ret; + int i; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn_in_rcu(fs_info, + "error %d while searching for dev_stats item for device %s", + ret, btrfs_dev_name(device)); + goto out; + } + + if (ret == 0 && + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + btrfs_warn_in_rcu(fs_info, + "delete too small dev_stats item for device %s failed %d", + btrfs_dev_name(device), ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + btrfs_warn_in_rcu(fs_info, + "insert dev_stats item for device %s failed %d", + btrfs_dev_name(device), ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_set_dev_stats_value(eb, ptr, i, + btrfs_dev_stat_read(device, i)); + btrfs_mark_buffer_dirty(trans, eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from 
commit_transaction. Writes all changed device stats to disk. + */ +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int stats_cnt; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + stats_cnt = atomic_read(&device->dev_stats_ccnt); + if (!device->dev_stats_valid || stats_cnt == 0) + continue; + + + /* + * There is a LOAD-LOAD control dependency between the value of + * dev_stats_ccnt and updating the on-disk values which requires + * reading the in-memory counters. Such control dependencies + * require explicit read memory barriers. + * + * This memory barriers pairs with smp_mb__before_atomic in + * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full + * barrier implied by atomic_xchg in + * btrfs_dev_stats_read_and_reset + */ + smp_rmb(); + + ret = update_dev_stat_item(trans, device); + if (!ret) + atomic_sub(stats_cnt, &device->dev_stats_ccnt); + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) +{ + btrfs_dev_stat_inc(dev, index); + + if (!dev->dev_stats_valid) + return; + btrfs_err_rl_in_rcu(dev->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", + btrfs_dev_name(dev), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (btrfs_dev_stat_read(dev, i) != 0) + break; + if (i == BTRFS_DEV_STAT_VALUES_MAX) + return; /* all values == 0, suppress message */ + + btrfs_info_in_rcu(dev->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", + btrfs_dev_name(dev), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_get_dev_stats *stats) +{ + BTRFS_DEV_LOOKUP_ARGS(args); + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + int i; + + mutex_lock(&fs_devices->device_list_mutex); + args.devid = stats->devid; + dev = btrfs_find_device(fs_info->fs_devices, &args); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + btrfs_warn(fs_info, "get dev_stats failed, device not found"); + return -ENODEV; + } else if (!dev->dev_stats_valid) { + btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); + return -ENODEV; + } else if (stats->flags & BTRFS_DEV_STATS_RESET) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (stats->nr_items > i) + stats->values[i] = + btrfs_dev_stat_read_and_reset(dev, i); + else + btrfs_dev_stat_set(dev, i, 0); + } + btrfs_info(fs_info, "device stats zeroed by %s (%d)", + current->comm, task_pid_nr(current)); + } else { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (stats->nr_items > i) + stats->values[i] = btrfs_dev_stat_read(dev, i); + } + if (stats->nr_items > 
BTRFS_DEV_STAT_VALUES_MAX) + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; + return 0; +} + +/* + * Update the size and bytes used for each device where it changed. This is + * delayed since we would otherwise get errors while writing out the + * superblocks. + * + * Must be invoked during transaction commit. + */ +void btrfs_commit_device_sizes(struct btrfs_transaction *trans) +{ + struct btrfs_device *curr, *next; + + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); + + if (list_empty(&trans->dev_update_list)) + return; + + /* + * We don't need the device_list_mutex here. This list is owned by the + * transaction and the transaction must complete before the device is + * released. + */ + mutex_lock(&trans->fs_info->chunk_mutex); + list_for_each_entry_safe(curr, next, &trans->dev_update_list, + post_commit_list) { + list_del_init(&curr->post_commit_list); + curr->commit_total_bytes = curr->disk_total_bytes; + curr->commit_bytes_used = curr->bytes_used; + } + mutex_unlock(&trans->fs_info->chunk_mutex); +} + +/* + * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. + */ +int btrfs_bg_type_to_factor(u64 flags) +{ + const int index = btrfs_bg_flags_to_raid_index(flags); + + return btrfs_raid_array[index].ncopies; +} + + + +static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, + u64 chunk_offset, u64 devid, + u64 physical_offset, u64 physical_len) +{ + struct btrfs_dev_lookup_args args = { .devid = devid }; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *dev; + u64 stripe_len; + bool found = false; + int ret = 0; + int i; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_err(fs_info, +"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", + physical_offset, devid); + ret = -EUCLEAN; + goto out; + } + + map = em->map_lookup; + stripe_len = btrfs_calc_stripe_length(em); + if (physical_len != stripe_len) { + btrfs_err(fs_info, +"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", + physical_offset, devid, em->start, physical_len, + stripe_len); + ret = -EUCLEAN; + goto out; + } + + /* + * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * space. Although kernel can handle it without problem, better to warn + * the users. 
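+ * The check below therefore only warns and still accepts such a dev extent.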
+ */ + if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) + btrfs_warn(fs_info, + "devid %llu physical %llu len %llu inside the reserved space", + devid, physical_offset, physical_len); + + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev->devid == devid && + map->stripes[i].physical == physical_offset) { + found = true; + if (map->verified_stripes >= map->num_stripes) { + btrfs_err(fs_info, + "too many dev extents for chunk %llu found", + em->start); + ret = -EUCLEAN; + goto out; + } + map->verified_stripes++; + break; + } + } + if (!found) { + btrfs_err(fs_info, + "dev extent physical offset %llu devid %llu has no corresponding chunk", + physical_offset, devid); + ret = -EUCLEAN; + } + + /* Make sure no dev extent is beyond device boundary */ + dev = btrfs_find_device(fs_info->fs_devices, &args); + if (!dev) { + btrfs_err(fs_info, "failed to find devid %llu", devid); + ret = -EUCLEAN; + goto out; + } + + if (physical_offset + physical_len > dev->disk_total_bytes) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", + devid, physical_offset, physical_len, + dev->disk_total_bytes); + ret = -EUCLEAN; + goto out; + } + + if (dev->zone_info) { + u64 zone_size = dev->zone_info->zone_size; + + if (!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size)) { + btrfs_err(fs_info, +"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", + devid, physical_offset, physical_len); + ret = -EUCLEAN; + goto out; + } + } + +out: + free_extent_map(em); + return ret; +} + +static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct rb_node *node; + int ret = 0; + + read_lock(&em_tree->lock); + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { + em = rb_entry(node, struct extent_map, rb_node); + if (em->map_lookup->num_stripes != + em->map_lookup->verified_stripes) { + btrfs_err(fs_info, + "chunk %llu has missing dev extent, have %d expect %d", + em->start, em->map_lookup->verified_stripes, + em->map_lookup->num_stripes); + ret = -EUCLEAN; + goto out; + } + } +out: + read_unlock(&em_tree->lock); + return ret; +} + +/* + * Ensure that all dev extents are mapped to correct chunk, otherwise + * later chunk allocation/free would cause unexpected behavior. + * + * NOTE: This will iterate through the whole device tree, which should be of + * the same size level as the chunk tree. This slightly increases mount time. + */ +int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; + int ret = 0; + + /* + * We don't have a dev_root because we mounted with ignorebadroots and + * failed to load the root, so we want to skip the verification in this + * case for sure. + * + * However if the dev root is fine, but the tree itself is corrupted + * we'd still fail to mount. This verification is only to make sure + * writes can happen safely, so instead just bypass this check + * completely in the case of IGNOREBADROOTS. 
+ */ + if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) + return 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = READA_FORWARD; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + while (1) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_dev_extent *dext; + int slot = path->slots[0]; + u64 chunk_offset; + u64 physical_offset; + u64 physical_len; + u64 devid; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.type != BTRFS_DEV_EXTENT_KEY) + break; + devid = key.objectid; + physical_offset = key.offset; + + dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); + physical_len = btrfs_dev_extent_length(leaf, dext); + + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, + physical_offset, physical_len); + if (ret < 0) + goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + } + + /* Ensure all chunks have corresponding dev extents */ + ret = verify_chunk_dev_extent_mapping(fs_info); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. 
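+ * The lookup walks fs_info->swapfile_pins, an rbtree keyed by the pointer value, under swapfile_pins_lock.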
+ */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ + struct btrfs_swapfile_pin *sp; + struct rb_node *node; + + spin_lock(&fs_info->swapfile_pins_lock); + node = fs_info->swapfile_pins.rb_node; + while (node) { + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (ptr < sp->ptr) + node = node->rb_left; + else if (ptr > sp->ptr) + node = node->rb_right; + else + break; + } + spin_unlock(&fs_info->swapfile_pins_lock); + return node != NULL; +} + +static int relocating_repair_kthread(void *data) +{ + struct btrfs_block_group *cache = data; + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 target; + int ret = 0; + + target = cache->start; + btrfs_put_block_group(cache); + + sb_start_write(fs_info->sb); + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + btrfs_info(fs_info, + "zoned: skip relocating block group %llu to repair: EBUSY", + target); + sb_end_write(fs_info->sb); + return -EBUSY; + } + + mutex_lock(&fs_info->reclaim_bgs_lock); + + /* Ensure block group still exists */ + cache = btrfs_lookup_block_group(fs_info, target); + if (!cache) + goto out; + + if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) + goto out; + + ret = btrfs_may_alloc_data_chunk(fs_info, target); + if (ret < 0) + goto out; + + btrfs_info(fs_info, + "zoned: relocating block group %llu to repair IO failure", + target); + ret = btrfs_relocate_chunk(fs_info, target); + +out: + if (cache) + btrfs_put_block_group(cache); + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); + + return ret; +} + +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + + if (!btrfs_is_zoned(fs_info)) + return false; + + /* Do not attempt to repair in degraded state */ + if (btrfs_test_opt(fs_info, DEGRADED)) + return true; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return true; + + if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { + btrfs_put_block_group(cache); + return true; + } + + kthread_run(relocating_repair_kthread, cache, + "btrfs-relocating-repair"); + + return true; +} + +static void map_raid56_repair_block(struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, + u64 logical) +{ + int data_stripes = nr_bioc_data_stripes(bioc); + int i; + + for (i = 0; i < data_stripes; i++) { + u64 stripe_start = bioc->full_stripe_logical + + btrfs_stripe_nr_to_offset(i); + + if (logical >= stripe_start && + logical < stripe_start + BTRFS_STRIPE_LEN) + break; + } + ASSERT(i < data_stripes); + smap->dev = bioc->stripes[i].dev; + smap->physical = bioc->stripes[i].physical + + ((logical - bioc->full_stripe_logical) & + BTRFS_STRIPE_LEN_MASK); +} + +/* + * Map a repair write into a single device. + * + * A repair write is triggered by read time repair or scrub, which would only + * update the contents of a single device. + * Not update any other mirrors nor go through RMW path. + * + * Callers should ensure: + * + * - Call btrfs_bio_counter_inc_blocked() first + * - The range does not cross stripe boundary + * - Has a valid @mirror_num passed in. 
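+ * + * For RAID56 the write is remapped to the single data stripe that covers @logical, see map_raid56_repair_block().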
+ */ +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num) +{ + struct btrfs_io_context *bioc = NULL; + u64 map_length = length; + int mirror_ret = mirror_num; + int ret; + + ASSERT(mirror_num > 0); + + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); + if (ret < 0) + return ret; + + /* The map range should not cross stripe boundary. */ + ASSERT(map_length >= length); + + /* Already mapped to single stripe. */ + if (!bioc) + goto out; + + /* Map the RAID56 multi-stripe writes to a single one. */ + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + map_raid56_repair_block(bioc, smap, logical); + goto out; + } + + ASSERT(mirror_num <= bioc->num_stripes); + smap->dev = bioc->stripes[mirror_num - 1].dev; + smap->physical = bioc->stripes[mirror_num - 1].physical; +out: + btrfs_put_bioc(bioc); + ASSERT(smap->dev); + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 0000000000..2128a032c3 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,752 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#ifndef BTRFS_VOLUMES_H +#define BTRFS_VOLUMES_H + +#include +#include +#include "async-thread.h" +#include "messages.h" +#include "tree-checker.h" +#include "rcu-string.h" + +#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) + +extern struct mutex uuid_mutex; + +#define BTRFS_STRIPE_LEN SZ_64K +#define BTRFS_STRIPE_LEN_SHIFT (16) +#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) + +static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); + +/* Used by sanity check for btrfs_raid_types. */ +#define const_ffs(n) (__builtin_ctzll(n) + 1) + +/* + * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires + * RAID0 always to be the lowest profile bit. + * Although it's part of on-disk format and should never change, do extra + * compile-time sanity checks. + */ +static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < + const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); +static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > + ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); + +/* ilog2() can handle both constants and variables */ +#define BTRFS_BG_FLAG_TO_INDEX(profile) \ + ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1)) + +enum btrfs_raid_types { + /* SINGLE is the special one as it doesn't have on-disk bit. */ + BTRFS_RAID_SINGLE = 0, + + BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0), + BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1), + BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP), + BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10), + BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5), + BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6), + BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3), + BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4), + + BTRFS_NR_RAID_TYPES +}; + +/* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. 
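+ * On 64-bit, u64 loads and stores don't tear, so the plain accessors generated by BTRFS_DEVICE_GETSET_FUNCS below are enough there.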
+ */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include +#define __BTRFS_NEED_DEVICE_DATA_ORDERED +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) +#else +#define btrfs_device_data_ordered_init(device) do { } while (0) +#endif + +#define BTRFS_DEV_STATE_WRITEABLE (0) +#define BTRFS_DEV_STATE_IN_FS_METADATA (1) +#define BTRFS_DEV_STATE_MISSING (2) +#define BTRFS_DEV_STATE_REPLACE_TGT (3) +#define BTRFS_DEV_STATE_FLUSH_SENT (4) +#define BTRFS_DEV_STATE_NO_READA (5) + +struct btrfs_zoned_device_info; + +struct btrfs_device { + struct list_head dev_list; /* device_list_mutex */ + struct list_head dev_alloc_list; /* chunk mutex */ + struct list_head post_commit_list; /* chunk mutex */ + struct btrfs_fs_devices *fs_devices; + struct btrfs_fs_info *fs_info; + + struct rcu_string __rcu *name; + + u64 generation; + + struct block_device *bdev; + + struct btrfs_zoned_device_info *zone_info; + + /* block device holder for blkdev_get/put */ + void *holder; + + /* + * Device's major-minor number. Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. + */ + dev_t devt; + unsigned long dev_state; + blk_status_t last_flush_error; + +#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED + seqcount_t data_seqcount; +#endif + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device in memory */ + u64 total_bytes; + + /* size of the device on disk */ + u64 disk_total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + /* type and info about this device */ + u64 type; + + /* minimal io size for this device */ + u32 sector_size; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* + * size of the device on the current transaction + * + * This variant is update when committing the transaction, + * and protected by chunk mutex + */ + u64 commit_total_bytes; + + /* bytes used on the current transaction */ + u64 commit_bytes_used; + + /* Bio used for flushing device barriers */ + struct bio flush_bio; + struct completion flush_wait; + + /* per-device scrub information */ + struct scrub_ctx *scrub_ctx; + + /* disk I/O failure stats. For detailed description refer to + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; + + /* Counter to record the change of device stats */ + atomic_t dev_stats_ccnt; + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; + + struct extent_io_tree alloc_state; + + struct completion kobj_unregister; + /* For sysfs/FSID/devinfo/devid/ */ + struct kobject devid_kobj; + + /* Bandwidth limit for scrub, in bytes */ + u64 scrub_speed_max; +}; + +/* + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. + */ +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr + * points to a struct btrfs_device. + */ + bool is_block_group; + /* + * Only used when 'is_block_group' is true and it is the number of + * extents used by a swapfile for this block group ('ptr' field). 
+ */ + int bg_extent_count; +}; + +/* + * If we read those variants at the context of their own lock, we needn't + * use the following helpers, reading them directly is safe. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + unsigned int seq; \ + \ + do { \ + seq = read_seqcount_begin(&dev->data_seqcount); \ + size = dev->name; \ + } while (read_seqcount_retry(&dev->data_seqcount, seq)); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + write_seqcount_begin(&dev->data_seqcount); \ + dev->name = size; \ + write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ +} +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + \ + preempt_disable(); \ + size = dev->name; \ + preempt_enable(); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + dev->name = size; \ + preempt_enable(); \ +} +#else +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + return dev->name; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + dev->name = size; \ +} +#endif + +BTRFS_DEVICE_GETSET_FUNCS(total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(bytes_used); + +enum btrfs_chunk_allocation_policy { + BTRFS_CHUNK_ALLOC_REGULAR, + BTRFS_CHUNK_ALLOC_ZONED, +}; + +/* + * Read policies for mirrored block group profiles, read picks the stripe based + * on these policies. + */ +enum btrfs_read_policy { + /* Use process PID to choose the stripe */ + BTRFS_READ_POLICY_PID, + BTRFS_NR_READ_POLICY, +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* + * UUID written into the btree blocks: + * + * - If metadata_uuid != fsid then super block must have + * BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag set. + * + * - Following shall be true at all times: + * - metadata_uuid == btrfs_header::fsid + * - metadata_uuid == btrfs_dev_item::fsid + */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + + struct list_head fs_list; + + /* + * Number of devices under this fsid including missing and + * replace-target device and excludes seed devices. + */ + u64 num_devices; + + /* + * The number of devices that successfully opened, including + * replace-target, excludes seed devices. + */ + u64 open_devices; + + /* The number of devices that are under the chunk allocation list. */ + u64 rw_devices; + + /* Count of missing devices under this fsid excluding seed device. */ + u64 missing_devices; + u64 total_rw_bytes; + + /* + * Count of devices from btrfs_super_block::num_devices for this fsid, + * which includes the seed device, excludes the transient replace-target + * device. + */ + u64 total_devices; + + /* Highest generation number of seen devices */ + u64 latest_generation; + + /* + * The mount device or a device with highest generation after removal + * or replace. 
+ */ + struct btrfs_device *latest_dev; + + /* + * All of the devices in the filesystem, protected by a mutex so we can + * safely walk it to write out the super blocks without worrying about + * adding/removing by the multi-device code. Scrubbing super block can + * kick off supers writing by holding this mutex lock. + */ + struct mutex device_list_mutex; + + /* List of all devices, protected by device_list_mutex */ + struct list_head devices; + + /* Devices which can satisfy space allocation. Protected by * chunk_mutex. */ + struct list_head alloc_list; + + struct list_head seed_list; + + /* Count fs-devices opened. */ + int opened; + + /* Set when we find or add a device that doesn't have the nonrot flag set. */ + bool rotating; + /* Devices support TRIM/discard commands. */ + bool discardable; + bool fsid_change; + /* The filesystem is a seed filesystem. */ + bool seeding; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject fsid_kobj; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; + struct completion kobj_unregister; + + enum btrfs_chunk_allocation_policy chunk_alloc_policy; + + /* Policy used to read the mirrored stripes. */ + enum btrfs_read_policy read_policy; +}; + +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +struct btrfs_io_stripe { + struct btrfs_device *dev; + union { + /* Block mapping */ + u64 physical; + /* For the endio handler */ + struct btrfs_io_context *bioc; + }; +}; + +struct btrfs_discard_stripe { + struct btrfs_device *dev; + u64 physical; + u64 length; +}; + +/* + * Context for IO subsmission for device stripe. + * + * - Track the unfinished mirrors for mirror based profiles + * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. + * + * - Contain the logical -> physical mapping info + * Used by submit_stripe_bio() for mapping logical bio + * into physical device address. + * + * - Contain device replace info + * Used by handle_ops_on_dev_replace() to copy logical bios + * into the new device. + * + * - Contain RAID56 full stripe logical bytenrs + */ +struct btrfs_io_context { + refcount_t refs; + struct btrfs_fs_info *fs_info; + u64 map_type; /* get from map_lookup->type */ + struct bio *orig_bio; + atomic_t error; + u16 max_errors; + + /* + * The total number of stripes, including the extra duplicated + * stripe for replace. + */ + u16 num_stripes; + + /* + * The mirror_num of this bioc. + * + * This is for reads which use 0 as mirror_num, thus we should return a + * valid mirror_num (>0) for the reader. + */ + u16 mirror_num; + + /* + * The following two members are for dev-replace case only. + * + * @replace_nr_stripes: Number of duplicated stripes which need to be + * written to replace target. + * Should be <= 2 (2 for DUP, otherwise <= 1). + * @replace_stripe_src: The array indicates where the duplicated stripes + * are from. + * + * The @replace_stripe_src[] array is mostly for RAID56 cases. + * As non-RAID56 stripes share the same contents of the mapped range, + * thus no need to bother where the duplicated ones are from. + * + * But for RAID56 case, all stripes contain different contents, thus + * we need a way to know the mapping. 
+ * + * There is an example for the two members, using a RAID5 write: + * + * num_stripes: 4 (3 + 1 duplicated write) + * stripes[0]: dev = devid 1, physical = X + * stripes[1]: dev = devid 2, physical = Y + * stripes[2]: dev = devid 3, physical = Z + * stripes[3]: dev = devid 0, physical = Y + * + * replace_nr_stripes = 1 + * replace_stripe_src = 1 <- Means stripes[1] is involved in replace. + * The duplicated stripe index would be + * (@num_stripes - 1). + * + * Note, that we can still have cases replace_nr_stripes = 2 for DUP. + * In that case, all stripes share the same content, thus we don't + * need to bother @replace_stripe_src value at all. + */ + u16 replace_nr_stripes; + s16 replace_stripe_src; + /* + * Logical bytenr of the full stripe start, only for RAID56 cases. + * + * When this value is set to other than (u64)-1, the stripes[] should + * follow this pattern: + * + * (real_stripes = num_stripes - replace_nr_stripes) + * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1)) + * + * stripes[0]: The first data stripe + * stripes[1]: The second data stripe + * ... + * stripes[data_stripes - 1]: The last data stripe + * stripes[data_stripes]: The P stripe + * stripes[data_stripes + 1]: The Q stripe (only for RAID6). + */ + u64 full_stripe_logical; + struct btrfs_io_stripe stripes[]; +}; + +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; + u64 total_avail; +}; + +struct btrfs_raid_attr { + u8 sub_stripes; /* sub_stripes info for map */ + u8 dev_stripes; /* stripes per dev */ + u8 devs_max; /* max devs to use */ + u8 devs_min; /* min devs needed */ + u8 tolerated_failures; /* max tolerated fail devs */ + u8 devs_increment; /* ndevs has to be a multiple of this */ + u8 ncopies; /* how many copies to data has */ + u8 nparity; /* number of stripes worth of bytes to store + * parity information */ + u8 mindev_error; /* error code if min devs requisite is unmet */ + const char raid_name[8]; /* name of the raid */ + u64 bg_flag; /* block group flag of the raid */ +}; + +extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int num_stripes; + int sub_stripes; + int verified_stripes; /* For mount time dev extent verification */ + struct btrfs_io_stripe stripes[]; +}; + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_io_stripe) * (n))) + +struct btrfs_balance_args; +struct btrfs_balance_progress; +struct btrfs_balance_control { + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; + + struct btrfs_balance_progress stat; +}; + +/* + * Search for a given device by the set parameters + */ +struct btrfs_dev_lookup_args { + u64 devid; + u8 *uuid; + u8 *fsid; + bool missing; +}; + +/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */ +#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 } + +#define BTRFS_DEV_LOOKUP_ARGS(name) \ + struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT + +enum btrfs_map_op { + BTRFS_MAP_READ, + BTRFS_MAP_WRITE, + BTRFS_MAP_GET_READ_MIRRORS, +}; + +static inline enum btrfs_map_op btrfs_op(struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: + return BTRFS_MAP_WRITE; + default: + WARN_ON_ONCE(1); + fallthrough; + case REQ_OP_READ: + return BTRFS_MAP_READ; + } +} + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + ASSERT(num_stripes); + 
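/* struct btrfs_chunk already embeds the first struct btrfs_stripe, hence the (num_stripes - 1) below. */ +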
return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +/* + * Do the type safe converstion from stripe_nr to offset inside the chunk. + * + * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger + * than 4G. This does the proper type cast to avoid overflow. + */ +static inline u64 btrfs_stripe_nr_to_offset(u32 stripe_nr) +{ + return (u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT; +} + +void btrfs_get_bioc(struct btrfs_io_context *bioc); +void btrfs_put_bioc(struct btrfs_io_context *bioc); +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num); +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); +int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); +int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, + u64 type); +void btrfs_mapping_tree_free(struct extent_map_tree *tree); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + blk_mode_t flags, void *holder); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); +int btrfs_forget_devices(dev_t devt); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); +void btrfs_assign_next_active_device(struct btrfs_device *device, + struct btrfs_device *this_dev); +struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, + u64 devid, + const char *devpath); +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path); +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, const u8 *uuid, + const char *path); +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); +int btrfs_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + struct block_device **bdev, void **holder); +void __exit btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); +void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); +int btrfs_recover_balance(struct btrfs_fs_info *fs_info); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); +void 
btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); +int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_get_dev_stats *stats); +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info); +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, + u64 logical, u64 len); +unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, + u64 logical); +u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); +int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg); +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +void btrfs_release_disk_super(struct btrfs_super_block *super); + +static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, + int index) +{ + atomic_inc(dev->dev_stat_values + index); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); +} + +static inline int btrfs_dev_stat_read(struct btrfs_device *dev, + int index) +{ + return atomic_read(dev->dev_stat_values + index); +} + +static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, + int index) +{ + int ret; + + ret = atomic_xchg(dev->dev_stat_values + index, 0); + /* + * atomic_xchg implies a full memory barriers as per atomic_t.txt: + * - RMW operations that have a return value are fully ordered; + * + * This implicit memory barriers is paired with the smp_rmb in + * btrfs_run_dev_stats + */ + atomic_inc(&dev->dev_stats_ccnt); + return ret; +} + +static inline void btrfs_dev_stat_set(struct btrfs_device *dev, + int index, unsigned long val) +{ + atomic_set(dev->dev_stat_values + index, val); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). 
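+ *
+ * Reader side, roughly as done in btrfs_run_dev_stats(): load
+ * dev_stats_ccnt, issue smp_rmb(), then read the dev_stat_values[]
+ * counters, so a reader that observes the bumped change count is
+ * guaranteed to also observe the updated counter values.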
+ */ + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); +} + +static inline const char *btrfs_dev_name(const struct btrfs_device *device) +{ + if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return ""; + else + return rcu_str_deref(device->name); +} + +void btrfs_commit_device_sizes(struct btrfs_transaction *trans); + +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path); + +enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); +int btrfs_bg_type_to_factor(u64 flags); +const char *btrfs_bg_type_to_raid_name(u64 flags); +int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); + +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); + +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 0000000000..b906f80965 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" +#include "props.h" +#include "locking.h" +#include "accessors.h" +#include "dir-item.h" + +int btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), + name, strlen(name), 0); + if (!di) { + ret = -ENODATA; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. 
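+ * For example, the xattr "user.foo" with value "bar" is stored as the
+ * dir_item header followed by the 8 name bytes and the 3 data bytes,
+ * i.e. btrfs_dir_name_len() == 8 and btrfs_dir_data_len() == 3.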
data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + size_t name_len = strlen(name); + int ret = 0; + + ASSERT(trans); + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info)) + return -ENOSPC; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_release_on_error = 1; + + if (!value) { + di = btrfs_lookup_xattr(trans, root, path, + btrfs_ino(BTRFS_I(inode)), name, name_len, -1); + if (!di && (flags & XATTR_REPLACE)) + ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + else if (di) + ret = btrfs_delete_one_dir_name(trans, root, path, di); + goto out; + } + + /* + * For a replace we can't just do the insert blindly. + * Do a lookup first (read-only btrfs_search_slot), and return if xattr + * doesn't exist. If it exists, fall down below to the insert/replace + * path - we can't race with a concurrent xattr delete, because the VFS + * locks the inode's i_mutex before calling setxattr or removexattr. + */ + if (flags & XATTR_REPLACE) { + ASSERT(inode_is_locked(inode)); + di = btrfs_lookup_xattr(NULL, root, path, + btrfs_ino(BTRFS_I(inode)), name, name_len, 0); + if (!di) + ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + if (ret) + goto out; + btrfs_release_path(path); + di = NULL; + } + + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)), + name, name_len, value, size); + if (ret == -EOVERFLOW) { + /* + * We have an existing item in a leaf, split_leaf couldn't + * expand it. That item might have or not a dir_item that + * matches our target xattr, so lets check. + */ + ret = 0; + btrfs_assert_tree_write_locked(path->nodes[0]); + di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + if (!di && !(flags & XATTR_REPLACE)) { + ret = -ENOSPC; + goto out; + } + } else if (ret == -EEXIST) { + ret = 0; + di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + ASSERT(di); /* logic error */ + } else if (ret) { + goto out; + } + + if (di && (flags & XATTR_CREATE)) { + ret = -EEXIST; + goto out; + } + + if (di) { + /* + * We're doing a replace, and it must be atomic, that is, at + * any point in time we have either the old or the new xattr + * value in the tree. We don't want readers (getxattr and + * listxattrs) to miss a value, this is specially important + * for ACLs. + */ + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); + const u32 item_size = btrfs_item_size(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; + unsigned long data_ptr; + char *ptr; + + if (size > old_data_len) { + if (btrfs_leaf_free_space(leaf) < + (size - old_data_len)) { + ret = -ENOSPC; + goto out; + } + } + + if (old_data_len + name_len + sizeof(*di) == item_size) { + /* No other xattrs packed in the same leaf item. 
*/ + if (size > old_data_len) + btrfs_extend_item(trans, path, size - old_data_len); + else if (size < old_data_len) + btrfs_truncate_item(trans, path, data_size, 1); + } else { + /* There are other xattrs packed in the same item. */ + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_extend_item(trans, path, data_size); + } + + ptr = btrfs_item_ptr(leaf, slot, char); + ptr += btrfs_item_size(leaf, slot) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; + write_extent_buffer(leaf, value, data_ptr, size); + btrfs_mark_buffer_dirty(trans, leaf); + } else { + /* + * Insert, and we had space for the xattr, so path->slots[0] is + * where our xattr dir_item is and btrfs_insert_xattr_item() + * filled it. + */ + } +out: + btrfs_free_path(path); + if (!ret) { + set_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags); + } + return ret; +} + +/* + * @value: "" makes the attribute to empty, NULL removes it + */ +int btrfs_setxattr_trans(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + const bool start_trans = (current->journal_info == NULL); + int ret; + + if (start_trans) { + /* + * 1 unit for inserting/updating/deleting the xattr + * 1 unit for the inode item update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } else { + /* + * This can happen when smack is enabled and a directory is being + * created. It happens through d_instantiate_new(), which calls + * smack_d_instantiate(), which in turn calls __vfs_setxattr() to + * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the + * inode. We have already reserved space for the xattr and inode + * update at btrfs_mkdir(), so just use the transaction handle. + * We don't join or start a transaction, as that will reset the + * block_rsv of the handle and trigger a warning for the start + * case. + */ + ASSERT(strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) == 0); + trans = current->journal_info; + } + + ret = btrfs_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode_inc_iversion(inode); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + btrfs_abort_transaction(trans, ret); +out: + if (start_trans) + btrfs_end_transaction(trans); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key found_key; + struct btrfs_key key; + struct inode *inode = d_inode(dentry); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + int iter_ret = 0; + int ret = 0; + size_t total_size = 0, size_left = size; + + /* + * ok we want all objects associated with this id. 
+ * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + /* search for our xattrs */ + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *leaf; + int slot; + struct btrfs_dir_item *di; + u32 item_size; + u32 cur; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + break; + if (found_key.type < BTRFS_XATTR_ITEM_KEY) + continue; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + item_size = btrfs_item_size(leaf, slot); + cur = 0; + while (cur < item_size) { + u16 name_len = btrfs_dir_name_len(leaf, di); + u16 data_len = btrfs_dir_data_len(leaf, di); + u32 this_len = sizeof(*di) + name_len + data_len; + unsigned long name_ptr = (unsigned long)(di + 1); + + total_size += name_len + 1; + /* + * We are just looking for how big our buffer needs to + * be. + */ + if (!size) + goto next; + + if (!buffer || (name_len + 1) > size_left) { + iter_ret = -ERANGE; + break; + } + + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; +next: + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + + if (iter_ret < 0) + ret = iter_ret; + else + ret = total_size; + + btrfs_free_path(path); + + return ret; +} + +static int btrfs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + name = xattr_full_name(handler, name); + return btrfs_getxattr(inode, name, buffer, size); +} + +static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) +{ + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + + name = xattr_full_name(handler, name); + return btrfs_setxattr_trans(inode, name, buffer, size, flags); +} + +static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + int ret; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + name = xattr_full_name(handler, name); + ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); + if (ret) + return ret; + + if (btrfs_ignore_prop(BTRFS_I(inode), name)) + return 0; + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_set_prop(trans, inode, name, value, size, flags); + if (!ret) { + inode_inc_iversion(inode); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + btrfs_abort_transaction(trans, ret); + } + + btrfs_end_transaction(trans); + + return ret; +} + +static const struct xattr_handler btrfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = 
btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_btrfs_xattr_handler = { + .prefix = XATTR_BTRFS_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set_prop, +}; + +const struct xattr_handler *btrfs_xattr_handlers[] = { + &btrfs_security_xattr_handler, + &btrfs_trusted_xattr_handler, + &btrfs_user_xattr_handler, + &btrfs_btrfs_xattr_handler, + NULL, +}; + +static int btrfs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_private) +{ + struct btrfs_trans_handle *trans = fs_private; + const struct xattr *xattr; + unsigned int nofs_flag; + char *name; + int err = 0; + + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_KERNEL); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = btrfs_setxattr(trans, inode, name, xattr->value, + xattr->value_len, 0); + kfree(name); + if (err < 0) + break; + } + memalloc_nofs_restore(nofs_flag); + return err; +} + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 0000000000..1cd3fc0a8f --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + */ + +#ifndef BTRFS_XATTR_H +#define BTRFS_XATTR_H + +#include + +extern const struct xattr_handler *btrfs_xattr_handlers[]; + +int btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const void *value, size_t size, int flags); +int btrfs_setxattr_trans(struct inode *inode, const char *name, + const void *value, size_t size, int flags); +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +#endif diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 0000000000..6c231a116a --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. 
+ * Created by David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +/* workspace buffer size for s390 zlib hardware support */ +#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) + +struct workspace { + z_stream strm; + char *buf; + unsigned int buf_size; + struct list_head list; + int level; +}; + +static struct workspace_manager wsm; + +struct list_head *zlib_get_workspace(unsigned int level) +{ + struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); + struct workspace *workspace = list_entry(ws, struct workspace, list); + + workspace->level = level; + + return ws; +} + +void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +struct list_head *zlib_alloc_workspace(unsigned int level) +{ + struct workspace *workspace; + int workspacesize; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); + workspace->level = level; + workspace->buf = NULL; + /* + * In case of s390 zlib hardware support, allocate lager workspace + * buffer. If allocator fails, fall back to a single page buffer. + */ + if (zlib_deflate_dfltcc_enabled()) { + workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN | GFP_NOIO); + workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; + } + if (!workspace->buf) { + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf_size = PAGE_SIZE; + } + if (!workspace->strm.workspace || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret; + char *data_in = NULL; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + unsigned int in_buf_pages; + unsigned long len = *total_out; + unsigned long nr_dest_pages = *out_pages; + const unsigned long max_out = nr_dest_pages * PAGE_SIZE; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { + pr_warn("BTRFS: deflateInit failed\n"); + ret = -EIO; + goto out; + } + + workspace->strm.total_in = 0; + workspace->strm.total_out = 0; + + out_page = alloc_page(GFP_NOFS); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = page_address(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = 0; + workspace->strm.next_out = cpage_out; + workspace->strm.avail_out = PAGE_SIZE; + + while (workspace->strm.total_in < len) { + /* + * Get next input pages and copy the contents to + * the workspace buffer if required. 
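+ *
+ * With the larger s390 DFLTCC workspace buffer several input pages
+ * are copied into workspace->buf in one go; with the default single
+ * page buffer the page is simply mapped and handed to zlib directly.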
+ */ + if (workspace->strm.avail_in == 0) { + bytes_left = len - workspace->strm.total_in; + in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_pages > 1) { + int i; + + for (i = 0; i < in_buf_pages; i++) { + if (data_in) { + kunmap_local(data_in); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap_local_page(in_page); + copy_page(workspace->buf + i * PAGE_SIZE, + data_in); + start += PAGE_SIZE; + } + workspace->strm.next_in = workspace->buf; + } else { + if (data_in) { + kunmap_local(data_in); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap_local_page(in_page); + start += PAGE_SIZE; + workspace->strm.next_in = data_in; + } + workspace->strm.avail_in = min(bytes_left, + (unsigned long) workspace->buf_size); + } + + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + pr_debug("BTRFS: deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->strm.total_in > 8192 && + workspace->strm.total_in < + workspace->strm.total_out) { + ret = -E2BIG; + goto out; + } + /* we need another page for writing out. Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->strm.avail_out == 0) { + if (nr_pages == nr_dest_pages) { + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = page_address(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->strm.total_in >= len) + break; + if (workspace->strm.total_out > max_out) + break; + } + workspace->strm.avail_in = 0; + /* + * Call deflate with Z_FINISH flush parameter providing more output + * space but no more input data, until it returns with Z_STREAM_END. 
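+ *
+ * In this loop Z_OK and Z_BUF_ERROR only mean that zlib still has
+ * pending output and needs another destination page; every other
+ * return code is treated as a hard error.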
+ */ + while (ret != Z_STREAM_END) { + ret = zlib_deflate(&workspace->strm, Z_FINISH); + if (ret == Z_STREAM_END) + break; + if (ret != Z_OK && ret != Z_BUF_ERROR) { + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } else if (workspace->strm.avail_out == 0) { + /* get another page for the stream end */ + if (nr_pages == nr_dest_pages) { + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = page_address(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.next_out = cpage_out; + } + } + zlib_deflateEnd(&workspace->strm); + + if (workspace->strm.total_out >= workspace->strm.total_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; + *total_out = workspace->strm.total_out; + *total_in = workspace->strm.total_in; +out: + *out_pages = nr_pages; + if (data_in) { + kunmap_local(data_in); + put_page(in_page); + } + + return ret; +} + +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + int wbits = MAX_WBITS; + char *data_in; + size_t total_out = 0; + unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long buf_start; + struct page **pages_in = cb->compressed_pages; + + data_in = kmap_local_page(pages_in[page_in_index]); + workspace->strm.next_in = data_in; + workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); + workspace->strm.total_in = 0; + + workspace->strm.total_out = 0; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = workspace->buf_size; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
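+	   The check below follows the zlib header format: the low nibble of
+	   the first byte must be 8 (deflate), its high nibble encodes the
+	   window size, and the two header bytes form a multiple of 31.
+	   Negative wbits then tell zlib_inflateInit2() to expect a raw
+	   deflate stream without the adler32 trailer.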
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { + pr_warn("BTRFS: inflateInit failed\n"); + kunmap_local(data_in); + return -EIO; + } + while (workspace->strm.total_in < srclen) { + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->strm.total_out; + + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) + break; + + ret2 = btrfs_decompress_buf2page(workspace->buf, + total_out - buf_start, cb, buf_start); + if (ret2 == 0) { + ret = 0; + goto done; + } + + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = workspace->buf_size; + + if (workspace->strm.avail_in == 0) { + unsigned long tmp; + kunmap_local(data_in); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap_local_page(pages_in[page_in_index]); + workspace->strm.next_in = data_in; + tmp = srclen - workspace->strm.total_in; + workspace->strm.avail_in = min(tmp, PAGE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -EIO; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap_local(data_in); + return ret; +} + +int zlib_decompress(struct list_head *ws, const u8 *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + int wbits = MAX_WBITS; + unsigned long bytes_left; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + + destlen = min_t(unsigned long, destlen, PAGE_SIZE); + bytes_left = destlen; + + workspace->strm.next_in = data_in; + workspace->strm.avail_in = srclen; + workspace->strm.total_in = 0; + + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = workspace->buf_size; + workspace->strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { + pr_warn("BTRFS: inflateInit failed\n"); + return -EIO; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->strm.total_out; + + if (total_out == buf_start) { + ret = -EIO; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_SIZE - pg_offset, + PAGE_SIZE - (buf_offset % PAGE_SIZE)); + bytes = min(bytes, bytes_left); + + memcpy_to_page(dest_page, pg_offset, + workspace->buf + buf_offset, bytes); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = workspace->buf_size; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -EIO; + else + ret = 0; + + zlib_inflateEnd(&workspace->strm); + + /* + * this should only happen if zlib returned fewer bytes than we + * expected. btrfs_get_block is responsible for zeroing from the + * end of the inline extent (destlen) to the end of the page + */ + if (pg_offset < destlen) { + memzero_page(dest_page, pg_offset, destlen - pg_offset); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zlib_compress = { + .workspace_manager = &wsm, + .max_level = 9, + .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, +}; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c new file mode 100644 index 0000000000..41a8cdce5d --- /dev/null +++ b/fs/btrfs/zoned.c @@ -0,0 +1,2533 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "zoned.h" +#include "rcu-string.h" +#include "disk-io.h" +#include "block-group.h" +#include "transaction.h" +#include "dev-replace.h" +#include "space-info.h" +#include "super.h" +#include "fs.h" +#include "accessors.h" +#include "bio.h" + +/* Maximum number of zones to report per blkdev_report_zones() call */ +#define BTRFS_REPORT_NR_ZONES 4096 +/* Invalid allocation pointer value for missing devices */ +#define WP_MISSING_DEV ((u64)-1) +/* Pseudo write pointer value for conventional zone */ +#define WP_CONVENTIONAL ((u64)-2) + +/* + * Location of the first zone of superblock logging zone pairs. 
+ * + * - primary superblock: 0B (zone 0) + * - first copy: 512G (zone starting at that offset) + * - second copy: 4T (zone starting at that offset) + */ +#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) +#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) +#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) + +#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) + +/* Number of superblock log zones */ +#define BTRFS_NR_SB_LOG_ZONES 2 + +/* + * Minimum of active zones we need: + * + * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors + * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group + * - 1 zone for tree-log dedicated block group + * - 1 zone for relocation + */ +#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) + +/* + * Minimum / maximum supported zone size. Currently, SMR disks have a zone + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. + * We do not expect the zone size to become larger than 8GiB or smaller than + * 4MiB in the near future. + */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G +#define BTRFS_MIN_ZONE_SIZE SZ_4M + +#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) + +static void wait_eb_writebacks(struct btrfs_block_group *block_group); +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written); + +static inline bool sb_zone_is_full(const struct blk_zone *zone) +{ + return (zone->cond == BLK_ZONE_COND_FULL) || + (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); +} + +static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) +{ + struct blk_zone *zones = data; + + memcpy(&zones[idx], zone, sizeof(*zone)); + + return 0; +} + +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, + u64 *wp_ret) +{ + bool empty[BTRFS_NR_SB_LOG_ZONES]; + bool full[BTRFS_NR_SB_LOG_ZONES]; + sector_t sector; + int i; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); + empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); + full[i] = sb_zone_is_full(&zones[i]); + } + + /* + * Possible states of log buffer zones + * + * Empty[0] In use[0] Full[0] + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C + * + * Log position: + * *: Special case, no superblock is written + * 0: Use write pointer of zones[0] + * 1: Use write pointer of zones[1] + * C: Compare super blocks from zones[0] and zones[1], use the latest + * one determined by generation + * x: Invalid state + */ + + if (empty[0] && empty[1]) { + /* Special case to distinguish no superblock to read */ + *wp_ret = zones[0].start << SECTOR_SHIFT; + return -ENOENT; + } else if (full[0] && full[1]) { + /* Compare two super blocks */ + struct address_space *mapping = bdev->bd_inode->i_mapping; + struct page *page[BTRFS_NR_SB_LOG_ZONES]; + struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; + int i; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT; + u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - + BTRFS_SUPER_INFO_SIZE; + + page[i] = read_cache_page_gfp(mapping, + bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page[i])) { + if (i == 1) + btrfs_release_disk_super(super[0]); + return PTR_ERR(page[i]); + } + super[i] = page_address(page[i]); + } + + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) + sector = zones[1].start; + else 
+ sector = zones[0].start; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) + btrfs_release_disk_super(super[i]); + } else if (!full[0] && (empty[1] || full[1])) { + sector = zones[0].wp; + } else if (full[0]) { + sector = zones[1].wp; + } else { + return -EUCLEAN; + } + *wp_ret = sector << SECTOR_SHIFT; + return 0; +} + +/* + * Get the first zone number of the superblock mirror + */ +static inline u32 sb_zone_number(int shift, int mirror) +{ + u64 zone = U64_MAX; + + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + switch (mirror) { + case 0: zone = 0; break; + case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; + case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; + } + + ASSERT(zone <= U32_MAX); + + return (u32)zone; +} + +static inline sector_t zone_start_sector(u32 zone_number, + struct block_device *bdev) +{ + return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); +} + +static inline u64 zone_start_physical(u32 zone_number, + struct btrfs_zoned_device_info *zone_info) +{ + return (u64)zone_number << zone_info->zone_size_shift; +} + +/* + * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block + * device into static sized chunks and fake a conventional zone on each of + * them. + */ +static int emulate_report_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int nr_zones) +{ + const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(device->bdev); + unsigned int i; + + pos >>= SECTOR_SHIFT; + for (i = 0; i < nr_zones; i++) { + zones[i].start = i * zone_sectors + pos; + zones[i].len = zone_sectors; + zones[i].capacity = zone_sectors; + zones[i].wp = zones[i].start + zone_sectors; + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; + zones[i].cond = BLK_ZONE_COND_NOT_WP; + + if (zones[i].wp >= bdev_size) { + i++; + break; + } + } + + return i; +} + +static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + int ret; + + if (!*nr_zones) + return 0; + + if (!bdev_is_zoned(device->bdev)) { + ret = emulate_report_zones(device, pos, zones, *nr_zones); + *nr_zones = ret; + return 0; + } + + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; + u32 zno; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; + /* + * We cannot report zones beyond the zone end. So, it is OK to + * cap *nr_zones to at the end. 
+ */ + *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); + + for (i = 0; i < *nr_zones; i++) { + struct blk_zone *zone_info; + + zone_info = &zinfo->zone_cache[zno + i]; + if (!zone_info->len) + break; + } + + if (i == *nr_zones) { + /* Cache hit on all the zones */ + memcpy(zones, zinfo->zone_cache + zno, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; + } + } + + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, + copy_zone_info_cb, zones); + if (ret < 0) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read zone %llu on %s (devid %llu)", + pos, rcu_str_deref(device->name), + device->devid); + return ret; + } + *nr_zones = ret; + if (!ret) + return -EIO; + + /* Populate cache */ + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); + } + + return 0; +} + +/* The emulated zone size is determined from the size of device extent */ +static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_dev_extent *dext; + int ret = 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); + ret = 0; + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + /* fs_info->zone_size might not set yet. Use the incomapt flag here. */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + /* We can skip reading of zone info for missing devices */ + if (!device->bdev) + continue; + + ret = btrfs_get_dev_zone_info(device, true); + if (ret) + break; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_zoned_device_info *zone_info = NULL; + struct block_device *bdev = device->bdev; + unsigned int max_active_zones; + unsigned int nactive; + sector_t nr_sectors; + sector_t sector = 0; + struct blk_zone *zones = NULL; + unsigned int i, nreported = 0, nr_zones; + sector_t zone_sectors; + char *model, *emulated; + int ret; + + /* + * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not + * yet be set. 
+ */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return 0; + + if (device->zone_info) + return 0; + + zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return -ENOMEM; + + device->zone_info = zone_info; + + if (!bdev_is_zoned(bdev)) { + if (!fs_info->zone_size) { + ret = calculate_emulated_zone_size(fs_info); + if (ret) + goto out; + } + + ASSERT(fs_info->zone_size); + zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; + } else { + zone_sectors = bdev_zone_sectors(bdev); + } + + ASSERT(is_power_of_two_u64(zone_sectors)); + zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + + /* We reject devices with a zone size larger than 8GB */ + if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu larger than supported maximum %llu", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); + ret = -EINVAL; + goto out; + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu smaller than supported minimum %u", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); + ret = -EINVAL; + goto out; + } + + nr_sectors = bdev_nr_sectors(bdev); + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + + max_active_zones = bdev_max_active_zones(bdev); + if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { + btrfs_err_in_rcu(fs_info, +"zoned: %s: max active zones %u is too small, need at least %u active zones", + rcu_str_deref(device->name), max_active_zones, + BTRFS_MIN_ACTIVE_ZONES); + ret = -EINVAL; + goto out; + } + zone_info->max_active_zones = max_active_zones; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) { + ret = -ENOMEM; + goto out; + } + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) { + ret = -ENOMEM; + goto out; + } + + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) { + ret = -ENOMEM; + goto out; + } + + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) { + ret = -ENOMEM; + goto out; + } + + /* + * Enable zone cache only for a zoned device. On a non-zoned device, we + * fill the zone info with emulated CONVENTIONAL zones, so no need to + * use the cache. 
+ */ + if (populate_cache && bdev_is_zoned(device->bdev)) { + zone_info->zone_cache = vcalloc(zone_info->nr_zones, + sizeof(struct blk_zone)); + if (!zone_info->zone_cache) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to allocate zone cache for %s", + rcu_str_deref(device->name)); + ret = -ENOMEM; + goto out; + } + } + + /* Get zones type */ + nactive = 0; + while (sector < nr_sectors) { + nr_zones = BTRFS_REPORT_NR_ZONES; + ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, + &nr_zones); + if (ret) + goto out; + + for (i = 0; i < nr_zones; i++) { + if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) + __set_bit(nreported, zone_info->seq_zones); + switch (zones[i].cond) { + case BLK_ZONE_COND_EMPTY: + __set_bit(nreported, zone_info->empty_zones); + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + __set_bit(nreported, zone_info->active_zones); + nactive++; + break; + } + nreported++; + } + sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; + } + + if (nreported != zone_info->nr_zones) { + btrfs_err_in_rcu(device->fs_info, + "inconsistent number of zones on %s (%u/%u)", + rcu_str_deref(device->name), nreported, + zone_info->nr_zones); + ret = -EIO; + goto out; + } + + if (max_active_zones) { + if (nactive > max_active_zones) { + btrfs_err_in_rcu(device->fs_info, + "zoned: %u active zones on %s exceeds max_active_zones %u", + nactive, rcu_str_deref(device->name), + max_active_zones); + ret = -EIO; + goto out; + } + atomic_set(&zone_info->active_zones_left, + max_active_zones - nactive); + set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); + } + + /* Validate superblock log */ + nr_zones = BTRFS_NR_SB_LOG_ZONES; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_wp; + int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; + + sb_zone = sb_zone_number(zone_info->zone_size_shift, i); + if (sb_zone + 1 >= zone_info->nr_zones) + continue; + + ret = btrfs_get_dev_zones(device, + zone_start_physical(sb_zone, zone_info), + &zone_info->sb_zones[sb_pos], + &nr_zones); + if (ret) + goto out; + + if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read super block log zone info at devid %llu zone %u", + device->devid, sb_zone); + ret = -EUCLEAN; + goto out; + } + + /* + * If zones[0] is conventional, always use the beginning of the + * zone to record superblock. No need to validate in that case. 
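+ *
+ * For a pair of sequential zones, sb_write_pointer() below only
+ * checks that the two log zones are in a consistent state; the
+ * -ENOENT "no superblock written yet" case is not an error here.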
+ */ + if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == + BLK_ZONE_TYPE_CONVENTIONAL) + continue; + + ret = sb_write_pointer(device->bdev, + &zone_info->sb_zones[sb_pos], &sb_wp); + if (ret != -ENOENT && ret) { + btrfs_err_in_rcu(device->fs_info, + "zoned: super block log zone corrupted devid %llu zone %u", + device->devid, sb_zone); + ret = -EUCLEAN; + goto out; + } + } + + + kvfree(zones); + + switch (bdev_zoned_model(bdev)) { + case BLK_ZONED_HM: + model = "host-managed zoned"; + emulated = ""; + break; + case BLK_ZONED_HA: + model = "host-aware zoned"; + emulated = ""; + break; + case BLK_ZONED_NONE: + model = "regular"; + emulated = "emulated "; + break; + default: + /* Just in case */ + btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", + bdev_zoned_model(bdev), + rcu_str_deref(device->name)); + ret = -EOPNOTSUPP; + goto out_free_zone_info; + } + + btrfs_info_in_rcu(fs_info, + "%s block device %s, %u %szones of %llu bytes", + model, rcu_str_deref(device->name), zone_info->nr_zones, + emulated, zone_info->zone_size); + + return 0; + +out: + kvfree(zones); +out_free_zone_info: + btrfs_destroy_dev_zone_info(device); + + return ret; +} + +void btrfs_destroy_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return; + + bitmap_free(zone_info->active_zones); + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + vfree(zone_info->zone_cache); + kfree(zone_info); + device->zone_info = NULL; +} + +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) +{ + struct btrfs_zoned_device_info *zone_info; + + zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return NULL; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) + goto out; + + bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, + zone_info->nr_zones); + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) + goto out; + + bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, + zone_info->nr_zones); + + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) + goto out; + + bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, + zone_info->nr_zones); + zone_info->zone_cache = NULL; + + return zone_info; + +out: + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->active_zones); + kfree(zone_info); + return NULL; +} + +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + unsigned int nr_zones = 1; + int ret; + + ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); + if (ret != 0 || !nr_zones) + return ret ? 
ret : -EIO; + + return 0; +} + +static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (device->bdev && + bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found: %pg", + device->bdev); + return -EINVAL; + } + } + + return 0; +} + +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) +{ + struct queue_limits *lim = &fs_info->limits; + struct btrfs_device *device; + u64 zone_size = 0; + int ret; + + /* + * Host-Managed devices can't be used without the ZONED flag. With the + * ZONED all devices can be used, using zone emulation if required. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + + blk_set_stacking_limits(lim); + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!device->bdev) + continue; + + if (!zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { + btrfs_err(fs_info, + "zoned: unequal block device zone sizes: have %llu found %llu", + zone_info->zone_size, zone_size); + return -EINVAL; + } + + /* + * With the zoned emulation, we can have non-zoned device on the + * zoned mode. In this case, we don't have a valid max zone + * append size. + */ + if (bdev_is_zoned(device->bdev)) { + blk_stack_limits(lim, + &bdev_get_queue(device->bdev)->limits, + 0); + } + } + + /* + * stripe_size is always aligned to BTRFS_STRIPE_LEN in + * btrfs_create_chunk(). Since we want stripe_len == zone_size, + * check the alignment here. + */ + if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, + "zoned: zone size %llu not aligned to stripe %u", + zone_size, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "zoned: mixed block groups not supported"); + return -EINVAL; + } + + fs_info->zone_size = zone_size; + /* + * Also limit max_zone_append_size by max_segments * PAGE_SIZE. + * Technically, we can have multiple pages per segment. But, since + * we add the pages one by one to a bio, and cannot increase the + * metadata reservation even if it increases the number of extents, it + * is safe to stick with the limit. + */ + fs_info->max_zone_append_size = ALIGN_DOWN( + min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, + (u64)lim->max_sectors << SECTOR_SHIFT, + (u64)lim->max_segments << PAGE_SHIFT), + fs_info->sectorsize); + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; + + /* + * Check mount options here, because we might change fs_info->zoned + * from fs_info->zone_size. + */ + ret = btrfs_check_mountopts_zoned(fs_info); + if (ret) + return ret; + + btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); + return 0; +} + +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +{ + if (!btrfs_is_zoned(info)) + return 0; + + /* + * Space cache writing is not COWed. Disable that to avoid write errors + * in sequential zones. 
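+ *
+ * The free space tree (space cache v2) lives in a regular COW b-tree
+ * and therefore stays usable in zoned mode; only the v1 file based
+ * cache is rejected here.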
+ */ + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_err(info, "zoned: space cache v1 is not supported"); + return -EINVAL; + } + + if (btrfs_test_opt(info, NODATACOW)) { + btrfs_err(info, "zoned: NODATACOW not supported"); + return -EINVAL; + } + + btrfs_clear_and_info(info, DISCARD_ASYNC, + "zoned: async discard ignored and disabled for zoned mode"); + + return 0; +} + +static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, + int rw, u64 *bytenr_ret) +{ + u64 wp; + int ret; + + if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { + *bytenr_ret = zones[0].start << SECTOR_SHIFT; + return 0; + } + + ret = sb_write_pointer(bdev, zones, &wp); + if (ret != -ENOENT && ret < 0) + return ret; + + if (rw == WRITE) { + struct blk_zone *reset = NULL; + + if (wp == zones[0].start << SECTOR_SHIFT) + reset = &zones[0]; + else if (wp == zones[1].start << SECTOR_SHIFT) + reset = &zones[1]; + + if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + ASSERT(sb_zone_is_full(reset)); + + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + reset->start, reset->len, + GFP_NOFS); + if (ret) + return ret; + + reset->cond = BLK_ZONE_COND_EMPTY; + reset->wp = reset->start; + } + } else if (ret != -ENOENT) { + /* + * For READ, we want the previous one. Move write pointer to + * the end of a zone, if it is at the head of a zone. + */ + u64 zone_end = 0; + + if (wp == zones[0].start << SECTOR_SHIFT) + zone_end = zones[1].start + zones[1].capacity; + else if (wp == zones[1].start << SECTOR_SHIFT) + zone_end = zones[0].start + zones[0].capacity; + if (zone_end) + wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, + BTRFS_SUPER_INFO_SIZE); + + wp -= BTRFS_SUPER_INFO_SIZE; + } + + *bytenr_ret = wp; + return 0; + +} + +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, + u64 *bytenr_ret) +{ + struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; + sector_t zone_sectors; + u32 sb_zone; + int ret; + u8 zone_sectors_shift; + sector_t nr_sectors; + u32 nr_zones; + + if (!bdev_is_zoned(bdev)) { + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; + } + + ASSERT(rw == READ || rw == WRITE); + + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) + return -EINVAL; + zone_sectors_shift = ilog2(zone_sectors); + nr_sectors = bdev_nr_sectors(bdev); + nr_zones = nr_sectors >> zone_sectors_shift; + + sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); + if (sb_zone + 1 >= nr_zones) + return -ENOENT; + + ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), + BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, + zones); + if (ret < 0) + return ret; + if (ret != BTRFS_NR_SB_LOG_ZONES) + return -EIO; + + return sb_log_location(bdev, zones, rw, bytenr_ret); +} + +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, + u64 *bytenr_ret) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zone_num; + + /* + * For a zoned filesystem on a non-zoned block device, use the same + * super block locations as regular filesystem. Doing so, the super + * block can always be retrieved and the zoned flag of the volume + * detected from the super block information. 
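+ *
+ * (btrfs_sb_offset() yields the usual fixed mirror locations, i.e.
+ * 64KiB for the primary copy and 64MiB / 256GiB for the two mirrors.)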
+ */ + if (!bdev_is_zoned(device->bdev)) { + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; + } + + zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); + if (zone_num + 1 >= zinfo->nr_zones) + return -ENOENT; + + return sb_log_location(device->bdev, + &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], + rw, bytenr_ret); +} + +static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, + int mirror) +{ + u32 zone_num; + + if (!zinfo) + return false; + + zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); + if (zone_num + 1 >= zinfo->nr_zones) + return false; + + if (!test_bit(zone_num, zinfo->seq_zones)) + return false; + + return true; +} + +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + struct blk_zone *zone; + int i; + + if (!is_sb_log_zone(zinfo, mirror)) + return 0; + + zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + /* Advance the next zone */ + if (zone->cond == BLK_ZONE_COND_FULL) { + zone++; + continue; + } + + if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += SUPER_INFO_SECTORS; + + if (sb_zone_is_full(zone)) { + /* + * No room left to write new superblock. Since + * superblock is written with REQ_SYNC, it is safe to + * finish the zone now. + * + * If the write pointer is exactly at the capacity, + * explicit ZONE_FINISH is not necessary. + */ + if (zone->wp != zone->start + zone->capacity) { + int ret; + + ret = blkdev_zone_mgmt(device->bdev, + REQ_OP_ZONE_FINISH, zone->start, + zone->len, GFP_NOFS); + if (ret) + return ret; + } + + zone->wp = zone->start + zone->len; + zone->cond = BLK_ZONE_COND_FULL; + } + return 0; + } + + /* All the zones are FULL. Should not reach here. */ + ASSERT(0); + return -EIO; +} + +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) +{ + sector_t zone_sectors; + sector_t nr_sectors; + u8 zone_sectors_shift; + u32 sb_zone; + u32 nr_zones; + + zone_sectors = bdev_zone_sectors(bdev); + zone_sectors_shift = ilog2(zone_sectors); + nr_sectors = bdev_nr_sectors(bdev); + nr_zones = nr_sectors >> zone_sectors_shift; + + sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); + if (sb_zone + 1 >= nr_zones) + return -ENOENT; + + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); +} + +/* + * Find allocatable zones within a given region. + * + * @device: the device to allocate a region on + * @hole_start: the position of the hole to allocate the region + * @num_bytes: size of wanted region + * @hole_end: the end of the hole + * @return: position of allocatable zones + * + * Allocatable region should not contain any superblock locations. 
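+ *
+ * The search below walks the hole zone by zone: sequential zones that
+ * are not empty are skipped, and a candidate range overlapping either a
+ * superblock log zone pair or a regular superblock offset is advanced
+ * past that reserved area before a position is returned.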
+ */ +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + u64 nzones = num_bytes >> shift; + u64 pos = hole_start; + u64 begin, end; + bool have_sb; + int i; + + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + + while (pos < hole_end) { + begin = pos >> shift; + end = begin + nzones; + + if (end > zinfo->nr_zones) + return hole_end; + + /* Check if zones in the region are all empty */ + if (btrfs_dev_is_sequential(device, pos) && + !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) { + pos += zinfo->zone_size; + continue; + } + + have_sb = false; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_pos; + + sb_zone = sb_zone_number(shift, i); + if (!(end <= sb_zone || + sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { + have_sb = true; + pos = zone_start_physical( + sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); + break; + } + + /* We also need to exclude regular superblock positions */ + sb_pos = btrfs_sb_offset(i); + if (!(pos + num_bytes <= sb_pos || + sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { + have_sb = true; + pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, + zinfo->zone_size); + break; + } + } + if (!have_sb) + break; + } + + return pos; +} + +static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return true; + + if (!test_bit(zno, zone_info->active_zones)) { + /* Active zone left? 
*/ + if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) + return false; + if (test_and_set_bit(zno, zone_info->active_zones)) { + /* Someone already set the bit */ + atomic_inc(&zone_info->active_zones_left); + } + } + + return true; +} + +static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return; + + if (test_and_clear_bit(zno, zone_info->active_zones)) + atomic_inc(&zone_info->active_zones_left); +} + +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes) +{ + int ret; + + *bytes = 0; + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, + GFP_NOFS); + if (ret) + return ret; + + *bytes = length; + while (length) { + btrfs_dev_set_zone_empty(device, physical); + btrfs_dev_clear_active_zone(device, physical); + physical += device->zone_info->zone_size; + length -= device->zone_info->zone_size; + } + + return 0; +} + +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + unsigned long begin = start >> shift; + unsigned long nbits = size >> shift; + u64 pos; + int ret; + + ASSERT(IS_ALIGNED(start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + + if (begin + nbits > zinfo->nr_zones) + return -ERANGE; + + /* All the zones are conventional */ + if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits)) + return 0; + + /* All the zones are sequential and empty */ + if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) && + bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits)) + return 0; + + for (pos = start; pos < start + size; pos += zinfo->zone_size) { + u64 reset_bytes; + + if (!btrfs_dev_is_sequential(device, pos) || + btrfs_dev_is_empty_zone(device, pos)) + continue; + + /* Free regions should be empty */ + btrfs_warn_in_rcu( + device->fs_info, + "zoned: resetting device %s (devid %llu) zone %llu for allocation", + rcu_str_deref(device->name), device->devid, pos >> shift); + WARN_ON_ONCE(1); + + ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, + &reset_bytes); + if (ret) + return ret; + } + + return 0; +} + +/* + * Calculate an allocation pointer from the extent allocation information + * for a block group consist of conventional zones. It is pointed to the + * end of the highest addressed extent in the block group as an allocation + * offset. + */ +static int calculate_alloc_pointer(struct btrfs_block_group *cache, + u64 *offset_ret, bool new) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + u64 length; + + /* + * Avoid tree lookups for a new block group, there's no use for it. + * It must always be 0. + * + * Also, we have a lock chain of extent buffer lock -> chunk mutex. + * For new a block group, this function is called from + * btrfs_make_block_group() which is already taking the chunk mutex. + * Thus, we cannot call calculate_alloc_pointer() which takes extent + * buffer locks to avoid deadlock. 
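The active-zone bookkeeping in btrfs_dev_set_active_zone()/btrfs_dev_clear_active_zone() above pairs an atomic budget (active_zones_left) with a per-zone flag, and refunds the budget when a racing caller already set the flag. The same pattern in a userspace sketch with C11 atomics; unlike the kernel's atomic_dec_if_positive(), this simplified version lets the counter dip below zero for an instant before the refund, and the types are illustrative:

#include <stdatomic.h>
#include <stdbool.h>

struct zone_budget {
	atomic_int left;		/* remaining active zones on the device */
	_Atomic unsigned char *active;	/* one flag per zone (a bitmap in the kernel) */
};

static bool set_active(struct zone_budget *b, unsigned int zno)
{
	if (atomic_load(&b->active[zno]))
		return true;			/* already active, nothing to pay */
	if (atomic_fetch_sub(&b->left, 1) <= 0) {
		atomic_fetch_add(&b->left, 1);	/* budget exhausted, undo */
		return false;
	}
	if (atomic_exchange(&b->active[zno], 1))
		atomic_fetch_add(&b->left, 1);	/* lost the race, refund */
	return true;
}

static void clear_active(struct zone_budget *b, unsigned int zno)
{
	if (atomic_exchange(&b->active[zno], 0))
		atomic_fetch_add(&b->left, 1);
}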
+ */ + if (new) { + *offset_ret = 0; + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = cache->start + cache->length; + key.type = 0; + key.offset = 0; + + root = btrfs_extent_root(fs_info, key.objectid); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + /* We should not find the exact match */ + if (!ret) + ret = -EUCLEAN; + if (ret < 0) + goto out; + + ret = btrfs_previous_extent_item(root, path, cache->start); + if (ret) { + if (ret == 1) { + ret = 0; + *offset_ret = 0; + } + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY) + length = found_key.offset; + else + length = fs_info->nodesize; + + if (!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length)) { + ret = -EUCLEAN; + goto out; + } + *offset_ret = found_key.objectid + length - cache->start; + ret = 0; + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *device; + u64 logical = cache->start; + u64 length = cache->length; + int ret; + int i; + unsigned int nofs_flag; + u64 *alloc_offsets = NULL; + u64 *caps = NULL; + u64 *physical = NULL; + unsigned long *active = NULL; + u64 last_alloc = 0; + u32 num_sequential = 0, num_conventional = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + /* Sanity check */ + if (!IS_ALIGNED(length, fs_info->zone_size)) { + btrfs_err(fs_info, + "zoned: block group %llu len %llu unaligned to zone size %llu", + logical, length, fs_info->zone_size); + return -EIO; + } + + /* Get the chunk mapping */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) + return -EINVAL; + + map = em->map_lookup; + + cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + if (!cache->physical_map) { + ret = -ENOMEM; + goto out; + } + + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); + if (!alloc_offsets) { + ret = -ENOMEM; + goto out; + } + + caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); + if (!caps) { + ret = -ENOMEM; + goto out; + } + + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); + if (!physical) { + ret = -ENOMEM; + goto out; + } + + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); + if (!active) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + bool is_sequential; + struct blk_zone zone; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + + device = map->stripes[i].dev; + physical[i] = map->stripes[i].physical; + + if (device->bdev == NULL) { + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } + + is_sequential = btrfs_dev_is_sequential(device, physical[i]); + if (is_sequential) + num_sequential++; + else + num_conventional++; + + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); + + if (!is_sequential) { + alloc_offsets[i] = WP_CONVENTIONAL; + continue; + } + + /* + * This zone will be used for allocation, so mark this zone + * non-empty. 
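The emulated write pointer that calculate_alloc_pointer() above derives for a block group made of conventional zones is just "end of the highest-addressed extent, relative to the block group start". A small worked sketch of that arithmetic with made-up values:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct last_extent { uint64_t objectid; uint64_t length; };

/* Allocation offset = end of the highest addressed extent minus bg start. */
static uint64_t emulated_alloc_offset(uint64_t bg_start, uint64_t bg_len,
				      const struct last_extent *ext)
{
	if (!ext)
		return 0;	/* no extents yet: nothing allocated */
	assert(ext->objectid >= bg_start &&
	       ext->objectid + ext->length <= bg_start + bg_len);
	return ext->objectid + ext->length - bg_start;
}

/*
 * Example: block group at 1 GiB, 256 MiB long, last extent starts 8 MiB in
 * and is 4 MiB long -> offset 12 MiB, i.e. allocation resumes 12 MiB into
 * the block group.
 */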
+ */ + btrfs_dev_clear_zone_empty(device, physical[i]); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); + up_read(&dev_replace->rwsem); + + /* + * The group is mapped to a sequential zone. Get the zone write + * pointer to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, physical[i], &zone); + memalloc_nofs_restore(nofs_flag); + if (ret == -EIO || ret == -EOPNOTSUPP) { + ret = 0; + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } else if (ret) { + goto out; + } + + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, + rcu_str_deref(device->name), device->devid); + ret = -EIO; + goto out; + } + + caps[i] = (zone.capacity << SECTOR_SHIFT); + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + physical[i] >> device->zone_info->zone_size_shift, + rcu_str_deref(device->name), device->devid); + alloc_offsets[i] = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + alloc_offsets[i] = 0; + break; + case BLK_ZONE_COND_FULL: + alloc_offsets[i] = caps[i]; + break; + default: + /* Partially used zone */ + alloc_offsets[i] = + ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(i, active); + break; + } + } + + if (num_sequential > 0) + set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); + + if (num_conventional > 0) { + /* Zone capacity is always zone size in emulation */ + cache->zone_capacity = cache->length; + ret = calculate_alloc_pointer(cache, &last_alloc, new); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to determine allocation offset of bg %llu", + cache->start); + goto out; + } else if (map->num_stripes == num_conventional) { + cache->alloc_offset = last_alloc; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); + goto out; + } + } + + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case 0: /* single */ + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = caps[0]; + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); + break; + case BTRFS_BLOCK_GROUP_DUP: + if (map->type & BTRFS_BLOCK_GROUP_DATA) { + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); + ret = -EINVAL; + goto out; + } + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + if (alloc_offsets[1] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[1]); + ret = -EIO; + goto out; + } + if (alloc_offsets[0] != alloc_offsets[1]) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + ret = -EIO; + goto out; + } + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(cache)) { + ret = -EIO; + goto out; + } + } else { + if (test_bit(0, active)) + 
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &cache->runtime_flags); + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = min(caps[0], caps[1]); + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID0: + case BTRFS_BLOCK_GROUP_RAID10: + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + /* non-single profiles are not supported yet */ + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + ret = -EINVAL; + goto out; + } + +out: + if (cache->alloc_offset > fs_info->zone_size) { + btrfs_err(fs_info, + "zoned: invalid write pointer %llu in block group %llu", + cache->alloc_offset, cache->start); + ret = -EIO; + } + + if (cache->alloc_offset > cache->zone_capacity) { + btrfs_err(fs_info, +"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", + cache->alloc_offset, cache->zone_capacity, + cache->start); + ret = -EIO; + } + + /* An extent is allocated after the write pointer */ + if (!ret && num_conventional && last_alloc > cache->alloc_offset) { + btrfs_err(fs_info, + "zoned: got wrong write pointer in BG %llu: %llu > %llu", + logical, last_alloc, cache->alloc_offset); + ret = -EIO; + } + + if (!ret) { + cache->meta_write_pointer = cache->alloc_offset + cache->start; + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + } else { + kfree(cache->physical_map); + cache->physical_map = NULL; + } + bitmap_free(active); + kfree(physical); + kfree(caps); + kfree(alloc_offsets); + free_extent_map(em); + + return ret; +} + +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) +{ + u64 unusable, free; + + if (!btrfs_is_zoned(cache->fs_info)) + return; + + WARN_ON(cache->bytes_super != 0); + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->cached = BTRFS_CACHE_FINISHED; + cache->free_space_ctl->free_space = free; + cache->zone_unusable = unusable; +} + +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) +{ + if (!btrfs_is_zoned(eb->fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) + return; + + ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); + set_extent_buffer_dirty(eb); + set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, + EXTENT_DIRTY | EXTENT_NOWAIT, NULL); +} + +bool btrfs_use_zone_append(struct btrfs_bio *bbio) +{ + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + + if (!inode || !is_data_inode(&inode->vfs_inode)) + return false; + + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) + return false; + + /* + * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * extent layout the relocation code has. 
+ * Furthermore we have set aside own block-group from which only the + * relocation "process" can allocate and make sure only one process at a + * time can add pages to an extent that gets relocated, so it's safe to + * use regular REQ_OP_WRITE for this special case. + */ + if (btrfs_is_data_reloc_root(inode->root)) + return false; + + cache = btrfs_lookup_block_group(fs_info, start); + ASSERT(cache); + if (!cache) + return false; + + ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); + btrfs_put_block_group(cache); + + return ret; +} + +void btrfs_record_physical_zoned(struct btrfs_bio *bbio) +{ + const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + struct btrfs_ordered_sum *sum = bbio->sums; + + if (physical < bbio->orig_physical) + sum->logical -= bbio->orig_physical - physical; + else + sum->logical += physical - bbio->orig_physical; +} + +static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, + u64 logical) +{ + struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree; + struct extent_map *em; + + ordered->disk_bytenr = logical; + + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); + em->block_start = logical; + free_extent_map(em); + write_unlock(&em_tree->lock); +} + +static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, + u64 logical, u64 len) +{ + struct btrfs_ordered_extent *new; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, + ordered->num_bytes, len, logical)) + return false; + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return false; + new->disk_bytenr = logical; + btrfs_finish_one_ordered(new); + return true; +} + +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_sum *sum; + u64 logical, len; + + /* + * Write to pre-allocated region is for the data relocation, and so + * it should use WRITE operation. No split/rewrite are necessary. + */ + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) + return; + + ASSERT(!list_empty(&ordered->list)); + /* The ordered->list can be empty in the above pre-alloc case. */ + sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); + logical = sum->logical; + len = sum->len; + + while (len < ordered->disk_num_bytes) { + sum = list_next_entry(sum, list); + if (sum->logical == logical + len) { + len += sum->len; + continue; + } + if (!btrfs_zoned_split_ordered(ordered, logical, len)) { + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + btrfs_err(fs_info, "failed to split ordered extent"); + goto out; + } + logical = sum->logical; + len = sum->len; + } + + if (ordered->disk_bytenr != logical) + btrfs_rewrite_logical_zoned(ordered, logical); + +out: + /* + * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures + * were allocated by btrfs_alloc_dummy_sum only to record the logical + * addresses and don't contain actual checksums. We thus must free them + * here so that we don't attempt to log the csums later. 
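The loop in btrfs_finish_ordered_zoned() above walks the per-bio checksum records in submission order, merges runs whose rewritten logical addresses turned out contiguous, and splits the ordered extent at every discontinuity left behind by ZONE_APPEND placement. A userspace sketch of just the run-merging step (struct and function names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct sum_rec { uint64_t logical; uint64_t len; };

/*
 * Print the contiguous runs in submission order.  Each run corresponds to
 * one ordered extent after splitting: the first run keeps the original
 * ordered extent, every later run would be split off.
 */
static void print_split_runs(const struct sum_rec *sums, int nr)
{
	uint64_t start = sums[0].logical, len = sums[0].len;

	for (int i = 1; i < nr; i++) {
		if (sums[i].logical == start + len) {
			len += sums[i].len;	/* still contiguous, extend */
			continue;
		}
		printf("run: %llu +%llu\n",
		       (unsigned long long)start, (unsigned long long)len);
		start = sums[i].logical;	/* discontiguity: start a new run */
		len = sums[i].len;
	}
	printf("run: %llu +%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
}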
+ */ + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + while ((sum = list_first_entry_or_null(&ordered->list, + typeof(*sum), list))) { + list_del(&sum->list); + kfree(sum); + } + } +} + +static bool check_bg_is_active(struct btrfs_eb_write_context *ctx, + struct btrfs_block_group **active_bg) +{ + const struct writeback_control *wbc = ctx->wbc; + struct btrfs_block_group *block_group = ctx->zoned_bg; + struct btrfs_fs_info *fs_info = block_group->fs_info; + + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + return true; + + if (fs_info->treelog_bg == block_group->start) { + if (!btrfs_zone_activate(block_group)) { + int ret_fin = btrfs_zone_finish_one_bg(fs_info); + + if (ret_fin != 1 || !btrfs_zone_activate(block_group)) + return false; + } + } else if (*active_bg != block_group) { + struct btrfs_block_group *tgt = *active_bg; + + /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */ + lockdep_assert_held(&fs_info->zoned_meta_io_lock); + + if (tgt) { + /* + * If there is an unsent IO left in the allocated area, + * we cannot wait for them as it may cause a deadlock. + */ + if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) { + if (wbc->sync_mode == WB_SYNC_NONE || + (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)) + return false; + } + + /* Pivot active metadata/system block group. */ + btrfs_zoned_meta_io_unlock(fs_info); + wait_eb_writebacks(tgt); + do_zone_finish(tgt, true); + btrfs_zoned_meta_io_lock(fs_info); + if (*active_bg == tgt) { + btrfs_put_block_group(tgt); + *active_bg = NULL; + } + } + if (!btrfs_zone_activate(block_group)) + return false; + if (*active_bg != block_group) { + ASSERT(*active_bg == NULL); + *active_bg = block_group; + btrfs_get_block_group(block_group); + } + } + + return true; +} + +/* + * Check if @ctx->eb is aligned to the write pointer. + * + * Return: + * 0: @ctx->eb is at the write pointer. You can write it. + * -EAGAIN: There is a hole. The caller should handle the case. + * -EBUSY: There is a hole, but the caller can just bail out. + */ +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) +{ + const struct writeback_control *wbc = ctx->wbc; + const struct extent_buffer *eb = ctx->eb; + struct btrfs_block_group *block_group = ctx->zoned_bg; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + if (block_group) { + if (block_group->start > eb->start || + block_group->start + block_group->length <= eb->start) { + btrfs_put_block_group(block_group); + block_group = NULL; + ctx->zoned_bg = NULL; + } + } + + if (!block_group) { + block_group = btrfs_lookup_block_group(fs_info, eb->start); + if (!block_group) + return 0; + ctx->zoned_bg = block_group; + } + + if (block_group->meta_write_pointer == eb->start) { + struct btrfs_block_group **tgt; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return 0; + + if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) + tgt = &fs_info->active_system_bg; + else + tgt = &fs_info->active_meta_bg; + if (check_bg_is_active(ctx, tgt)) + return 0; + } + + /* + * Since we may release fs_info->zoned_meta_io_lock, someone can already + * start writing this eb. In that case, we can just bail out. + */ + if (block_group->meta_write_pointer > eb->start) + return -EBUSY; + + /* If for_sync, this hole will be filled with trasnsaction commit. 
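Stripped of the block-group lookup and activation handling, the return-value contract of btrfs_check_meta_write_pointer() above reduces to comparing the extent buffer start with the block group's metadata write pointer. A reduced sketch of that decision, keeping the kernel's errno values but simplifying the writeback-mode test to a single flag:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * 0       - eb sits exactly at the write pointer: write it now
 * -EBUSY  - eb is already behind the write pointer (someone else wrote it),
 *           or there is a hole but this writeback should just bail out
 * -EAGAIN - there is a hole in front of the write pointer and the caller is
 *           expected to handle it (e.g. wait for a transaction commit)
 */
static int check_meta_write_pointer_model(uint64_t eb_start,
					  uint64_t write_pointer,
					  bool caller_can_wait)
{
	if (write_pointer == eb_start)
		return 0;
	if (write_pointer > eb_start)
		return -EBUSY;
	return caller_can_wait ? -EAGAIN : -EBUSY;
}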
*/ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + return -EAGAIN; + return -EBUSY; +} + +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) +{ + if (!btrfs_dev_is_sequential(device, physical)) + return -EOPNOTSUPP; + + return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, + length >> SECTOR_SHIFT, GFP_NOFS, 0); +} + +static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, + struct blk_zone *zone) +{ + struct btrfs_io_context *bioc = NULL; + u64 mapped_length = PAGE_SIZE; + unsigned int nofs_flag; + int nmirrors; + int i, ret; + + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bioc, NULL, NULL, 1); + if (ret || !bioc || mapped_length < PAGE_SIZE) { + ret = -EIO; + goto out_put_bioc; + } + + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } + + nofs_flag = memalloc_nofs_save(); + nmirrors = (int)bioc->num_stripes; + for (i = 0; i < nmirrors; i++) { + u64 physical = bioc->stripes[i].physical; + struct btrfs_device *dev = bioc->stripes[i].dev; + + /* Missing device */ + if (!dev->bdev) + continue; + + ret = btrfs_get_dev_zone(dev, physical, zone); + /* Failing device */ + if (ret == -EIO || ret == -EOPNOTSUPP) + continue; + break; + } + memalloc_nofs_restore(nofs_flag); +out_put_bioc: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by + * filling zeros between @physical_pos to a write pointer of dev-replace + * source device. + */ +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos) +{ + struct btrfs_fs_info *fs_info = tgt_dev->fs_info; + struct blk_zone zone; + u64 length; + u64 wp; + int ret; + + if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) + return 0; + + ret = read_zone_info(fs_info, logical, &zone); + if (ret) + return ret; + + wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); + + if (physical_pos == wp) + return 0; + + if (physical_pos > wp) + return -EUCLEAN; + + length = wp - physical_pos; + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); +} + +/* + * Activate block group and underlying device zones + * + * @block_group: the block group to activate + * + * Return: true on success, false otherwise + */ +bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); + bool ret; + int i; + + if (!btrfs_is_zoned(block_group->fs_info)) + return true; + + map = block_group->physical_map; + + spin_lock(&fs_info->zone_active_bgs_lock); + spin_lock(&block_group->lock); + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { + ret = true; + goto out_unlock; + } + + /* No space left */ + if (btrfs_zoned_bg_is_full(block_group)) { + ret = false; + goto out_unlock; + } + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_zoned_device_info *zinfo; + int reserved = 0; + + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + zinfo = device->zone_info; + + if (zinfo->max_active_zones == 0) + continue; + + if (is_data) + reserved = zinfo->reserved_active_zones; + /* + * For the data block group, leave active zones for one + * metadata block group and one system block group. 
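The arithmetic behind btrfs_sync_zone_write_pointer() above is small: translate the source zone's write pointer into a byte offset on the target, then zero-fill from the target's current position up to that point. A sketch of just the computation, assuming 512-byte sectors as implied by SECTOR_SHIFT:

#include <stdint.h>

#define SECTOR_SHIFT 9

/*
 * Returns the number of bytes to zero out on the replace target, starting at
 * physical_pos, so that its write pointer matches the source zone's; 0 when
 * they already agree, and (uint64_t)-1 for the "target ahead of source"
 * inconsistency that the kernel maps to -EUCLEAN.
 */
static uint64_t zeroout_len(uint64_t physical_start, uint64_t physical_pos,
			    uint64_t zone_start_sect, uint64_t zone_wp_sect)
{
	uint64_t wp = physical_start +
		      ((zone_wp_sect - zone_start_sect) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;
	if (physical_pos > wp)
		return (uint64_t)-1;
	return wp - physical_pos;
}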
+ */ + if (atomic_read(&zinfo->active_zones_left) <= reserved) { + ret = false; + goto out_unlock; + } + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } + if (!is_data) + zinfo->reserved_active_zones--; + } + + /* Successfully activated all the zones */ + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); + spin_unlock(&block_group->lock); + + /* For the active block group list */ + btrfs_get_block_group(block_group); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + + return true; + +out_unlock: + spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); + return ret; +} + +static void wait_eb_writebacks(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; + + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + int ret = 0; + int i; + + spin_lock(&block_group->lock); + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return 0; + } + + /* Check if we have unwritten allocated space */ + if (is_metadata && + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } + + /* + * If we are sure that the block group is full (= no more room left for + * new allocation) and the IO for the last usable block is completed, we + * don't need to wait for the other IOs. This holds because we ensure + * the sequential IO submissions using the ZONE_APPEND command for data + * and block_group->meta_write_pointer for metadata. + */ + if (!fully_written) { + if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } + spin_unlock(&block_group->lock); + + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + /* Wait for extent buffers to be written. */ + if (is_metadata) + wait_eb_writebacks(block_group); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. 
+ */ + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } + } + + clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); + block_group->alloc_offset = block_group->zone_capacity; + if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) + block_group->meta_write_pointer = block_group->start + + block_group->zone_capacity; + block_group->free_space_ctl->free_space = 0; + btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); + spin_unlock(&block_group->lock); + + map = block_group->physical_map; + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + const u64 physical = map->stripes[i].physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; + + if (zinfo->max_active_zones == 0) + continue; + + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT, + GFP_NOFS); + + if (ret) + return ret; + + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; + btrfs_dev_clear_active_zone(device, physical); + } + + if (!fully_written) + btrfs_dec_block_group_ro(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + + clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + + return 0; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + if (!btrfs_is_zoned(block_group->fs_info)) + return 0; + + return do_zone_finish(block_group, false); +} + +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) +{ + struct btrfs_fs_info *fs_info = fs_devices->fs_info; + struct btrfs_device *device; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return true; + + /* Check if there is a device with active zones left */ + mutex_lock(&fs_info->chunk_mutex); + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + int reserved = 0; + + if (!device->bdev) + continue; + + if (!zinfo->max_active_zones) { + ret = true; + break; + } + + if (flags & BTRFS_BLOCK_GROUP_DATA) + reserved = zinfo->reserved_active_zones; + + switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case 0: /* single */ + ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); + break; + case BTRFS_BLOCK_GROUP_DUP: + ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); + break; + } + if (ret) + break; + } + spin_unlock(&fs_info->zone_active_bgs_lock); + mutex_unlock(&fs_info->chunk_mutex); + + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + + return ret; +} + +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +{ + struct btrfs_block_group *block_group; + u64 min_alloc_bytes; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + ASSERT(block_group); + + /* No MIXED_BG on zoned btrfs. 
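The per-device test in btrfs_can_activate_zone() above only needs to know how many zones a new block group of the given profile would pin on one device, plus the zones reserved for metadata/system when the question is asked on behalf of data. A sketch of that check, with the profile encoding simplified to an enum:

#include <stdbool.h>

enum profile { PROFILE_SINGLE, PROFILE_DUP };

/* Zones a new block group of this profile needs on one device. */
static int zones_needed(enum profile p)
{
	return p == PROFILE_DUP ? 2 : 1;
}

/*
 * reserved is nonzero only for data allocations: data must leave room for
 * one metadata and one system block group (see the reservation code above).
 */
static bool can_activate(int active_zones_left, int reserved, enum profile p)
{
	return active_zones_left >= zones_needed(p) + reserved;
}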
*/ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + min_alloc_bytes = fs_info->sectorsize; + else + min_alloc_bytes = fs_info->nodesize; + + /* Bail out if we can allocate more data from this block group. */ + if (logical + length + min_alloc_bytes <= + block_group->start + block_group->zone_capacity) + goto out; + + do_zone_finish(block_group, true); + +out: + btrfs_put_block_group(block_group); +} + +static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +{ + struct btrfs_block_group *bg = + container_of(work, struct btrfs_block_group, zone_finish_work); + + wait_on_extent_buffer_writeback(bg->last_eb); + free_extent_buffer(bg->last_eb); + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + btrfs_put_block_group(bg); +} + +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) +{ + if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || + eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + return; + + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", + bg->start); + return; + } + + /* For the work */ + btrfs_get_block_group(bg); + atomic_inc(&eb->refs); + bg->last_eb = eb; + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); + queue_work(system_unbound_wq, &bg->zone_finish_work); +} + +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg == bg->start) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); +} + +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + if (!btrfs_is_zoned(fs_info)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->zone_info) { + vfree(device->zone_info->zone_cache); + device->zone_info->zone_cache = NULL; + } + } + mutex_unlock(&fs_devices->device_list_mutex); +} + +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 used = 0; + u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); + + if (fs_info->bg_reclaim_threshold == 0) + return false; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); + + factor = div64_u64(used * 100, total); + return factor >= fs_info->bg_reclaim_threshold; +} + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) + goto out; + + /* All relocation extents are written. 
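btrfs_zoned_should_reclaim() above is a plain percentage test over all devices that have a block device attached; the default threshold is 75 (BTRFS_DEFAULT_RECLAIM_THRESH in zoned.h below). The same computation as a sketch, with a divide-by-zero guard added for safety:

#include <stdbool.h>
#include <stdint.h>

static bool should_reclaim(uint64_t bytes_used, uint64_t bytes_total,
			   uint64_t threshold_percent)
{
	if (threshold_percent == 0 || bytes_total == 0)
		return false;
	/* used * 100 / total, matching the div64_u64() call above */
	return (bytes_used * 100) / bytes_total >= threshold_percent;
}

/* e.g. 780 GiB used out of 1000 GiB with the default 75% -> reclaim */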
*/ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* + * Now, release this block group for further allocations and + * zone finish. + */ + clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags); + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || block_group->alloc_offset == 0 || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? ret : 1; +} + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &bg->runtime_flags)) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * Reserve zones for one metadata block group, one tree-log block group, and one + * system block group. + */ +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_block_group *block_group; + struct btrfs_device *device; + /* Reserve zones for normal SINGLE metadata and tree-log block group. */ + unsigned int metadata_reserve = 2; + /* Reserve a zone for SINGLE system block group. */ + unsigned int system_reserve = 1; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return; + + /* + * This function is called from the mount context. So, there is no + * parallel process touching the bits. No need for read_seqretry(). + */ + if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + metadata_reserve = 4; + if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + system_reserve = 2; + + /* Apply the reservation on all the devices. 
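btrfs_zone_finish_one_bg() above picks the victim with the least remaining capacity, i.e. the active block group whose zone-finish wastes the least space. A sketch of that selection, with all the skip conditions (reserved, empty, system, relocation) collapsed into one flag:

#include <stdbool.h>
#include <stdint.h>

struct bg_model { uint64_t zone_capacity; uint64_t alloc_offset; bool skip; };

/* Returns the index of the block group to finish, or -1 if none qualifies. */
static int pick_bg_to_finish(const struct bg_model *bgs, int nr)
{
	uint64_t min_avail = UINT64_MAX;
	int victim = -1;

	for (int i = 0; i < nr; i++) {
		uint64_t avail;

		if (bgs[i].skip)
			continue;
		avail = bgs[i].zone_capacity - bgs[i].alloc_offset;
		if (avail < min_avail) {
			min_avail = avail;
			victim = i;
		}
	}
	return victim;
}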
*/ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + device->zone_info->reserved_active_zones = + metadata_reserve + system_reserve; + } + mutex_unlock(&fs_devices->device_list_mutex); + + /* Release reservation for currently active block groups. */ + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + struct map_lookup *map = block_group->physical_map; + + if (!(block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) + continue; + + for (int i = 0; i < map->num_stripes; i++) + map->stripes[i].dev->zone_info->reserved_active_zones--; + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h new file mode 100644 index 0000000000..b9cec523b7 --- /dev/null +++ b/fs/btrfs/zoned.h @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ZONED_H +#define BTRFS_ZONED_H + +#include +#include +#include "messages.h" +#include "volumes.h" +#include "disk-io.h" +#include "block-group.h" +#include "btrfs_inode.h" + +#define BTRFS_DEFAULT_RECLAIM_THRESH (75) + +struct btrfs_zoned_device_info { + /* + * Number of zones, zone size and types of zones if bdev is a + * zoned block device. + */ + u64 zone_size; + u8 zone_size_shift; + u32 nr_zones; + unsigned int max_active_zones; + /* + * Reserved active zones for one metadata and one system block group. + * It can vary per-device depending on the allocation status. + */ + int reserved_active_zones; + atomic_t active_zones_left; + unsigned long *seq_zones; + unsigned long *empty_zones; + unsigned long *active_zones; + struct blk_zone *zone_cache; + struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; +}; + +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered); + +#ifdef CONFIG_BLK_DEV_ZONED +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone); +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); +void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, + u64 *bytenr_ret); +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, + u64 *bytenr_ret); +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror); +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes); +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes); +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); +bool btrfs_use_zone_append(struct btrfs_bio *bbio); +void btrfs_record_physical_zoned(struct btrfs_bio *bbio); +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx); +int 
btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); +bool btrfs_zone_activate(struct btrfs_block_group *block_group); +int btrfs_zone_finish(struct btrfs_block_group *block_group); +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, bool do_finish); +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + return 0; +} + +static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + return 0; +} + +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, + bool populate_cache) +{ + return 0; +} + +static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } + +/* + * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call + * into btrfs_clone_dev_zone_info() so it's safe to return NULL here. + */ +static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info( + struct btrfs_device *orig_dev) +{ + return NULL; +} + +static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_err(fs_info, "zoned block devices support is not enabled"); + return -EOPNOTSUPP; +} + +static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +{ + return 0; +} + +static inline int btrfs_sb_log_location_bdev(struct block_device *bdev, + int mirror, int rw, u64 *bytenr_ret) +{ + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; +} + +static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror, + int rw, u64 *bytenr_ret) +{ + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; +} + +static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + return 0; +} + +static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) +{ + return 0; +} + +static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device, + u64 hole_start, u64 hole_end, + u64 num_bytes) +{ + return hole_start; +} + +static inline int btrfs_reset_device_zone(struct btrfs_device *device, + u64 physical, u64 length, u64 *bytes) +{ + *bytes = 0; + return 0; +} + +static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, + u64 start, u64 size) +{ + return 0; +} + +static inline int btrfs_load_block_group_zone_info( + struct btrfs_block_group *cache, bool new) +{ + return 0; +} + +static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } + +static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } + +static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) +{ + 
return false; +} + +static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) +{ +} + +static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) +{ + return 0; +} + +static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, + u64 physical, u64 length) +{ + return -EOPNOTSUPP; +} + +static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + u64 logical, u64 physical_start, + u64 physical_pos) +{ + return -EOPNOTSUPP; +} + +static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + return true; +} + +static inline int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + return 0; +} + +static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + u64 flags) +{ + return true; +} + +static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } + +static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + +static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + +static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + return false; +} + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + return 1; +} + +static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + /* Consider all the block groups are active */ + return 0; +} + +static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } + +#endif + +static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return false; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones); +} + +static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return true; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device, + u64 pos, bool set) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno; + + if (!zone_info) + return; + + zno = pos >> zone_info->zone_size_shift; + if (set) + set_bit(zno, zone_info->empty_zones); + else + clear_bit(zno, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, true); +} + +static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, false); +} + +static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, + struct block_device *bdev) +{ + if (btrfs_is_zoned(fs_info)) { + /* + * We can allow a regular device on a zoned filesystem, because + * we will emulate the zoned capabilities. 
+ */ + if (!bdev_is_zoned(bdev)) + return true; + + return fs_info->zone_size == + (bdev_zone_sectors(bdev) << SECTOR_SHIFT); + } + + /* Do not allow Host Manged zoned device */ + return bdev_zoned_model(bdev) != BLK_ZONED_HM; +} + +static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos) +{ + /* + * On a non-zoned device, any address is OK. On a zoned device, + * non-SEQUENTIAL WRITE REQUIRED zones are capable. + */ + return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); +} + +static inline bool btrfs_can_zone_reset(struct btrfs_device *device, + u64 physical, u64 length) +{ + u64 zone_size; + + if (!btrfs_dev_is_sequential(device, physical)) + return false; + + zone_size = device->zone_info->zone_size; + if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size)) + return false; + + return true; +} + +static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_lock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_unlock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if (!btrfs_is_zoned(fs_info)) + return; + + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg == bg->start) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); +} + +static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + mutex_lock(&root->fs_info->zoned_data_reloc_io_lock); +} + +static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); +} + +static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) +{ + ASSERT(btrfs_is_zoned(bg->fs_info)); + return (bg->alloc_offset == bg->zone_capacity); +} + +#endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c new file mode 100644 index 0000000000..e7ac4ec809 --- /dev/null +++ b/fs/btrfs/zstd.c @@ -0,0 +1,702 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "misc.h" +#include "compression.h" +#include "ctree.h" + +#define ZSTD_BTRFS_MAX_WINDOWLOG 17 +#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MAX_LEVEL 15 +/* 307s to avoid pathologically clashing with transaction commit */ +#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) + +static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, + size_t src_len) +{ + zstd_parameters params = zstd_get_params(level, src_len); + + if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) + params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; + WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); + return params; +} + +struct workspace { + void *mem; + size_t size; + char *buf; + unsigned int level; + unsigned int req_level; + unsigned long last_used; /* jiffies */ + struct list_head list; + struct list_head lru_list; + zstd_in_buffer in_buf; + zstd_out_buffer out_buf; +}; + +/* + * Zstd Workspace Management + * + * Zstd workspaces have different memory requirements depending on the level. + * The zstd workspaces are managed by having individual lists for each level + * and a global lru. Forward progress is maintained by protecting a max level + * workspace. + * + * Getting a workspace is done by using the bitmap to identify the levels that + * have available workspaces and scans up. This lets us recycle higher level + * workspaces because of the monotonic memory guarantee. A workspace's + * last_used is only updated if it is being used by the corresponding memory + * level. Putting a workspace involves adding it back to the appropriate places + * and adding it back to the lru if necessary. + * + * A timer is used to reclaim workspaces if they have not been used for + * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around. + * The upper bound is provided by the workqueue limit which is 2 (percpu limit). + */ + +struct zstd_workspace_manager { + const struct btrfs_compress_op *ops; + spinlock_t lock; + struct list_head lru_list; + struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; + unsigned long active_map; + wait_queue_head_t wait; + struct timer_list timer; +}; + +static struct zstd_workspace_manager wsm; + +static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; + +static inline struct workspace *list_to_workspace(struct list_head *list) +{ + return container_of(list, struct workspace, list); +} + +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_alloc_workspace(unsigned int level); + +/* + * Timer callback to free unused workspaces. + * + * @t: timer + * + * This scans the lru_list and attempts to reclaim any workspace that hasn't + * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + * + * The context is softirq and does not need the _bh locking primitives. 
+ */ +static void zstd_reclaim_timer_fn(struct timer_list *timer) +{ + unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; + struct list_head *pos, *next; + + spin_lock(&wsm.lock); + + if (list_empty(&wsm.lru_list)) { + spin_unlock(&wsm.lock); + return; + } + + list_for_each_prev_safe(pos, next, &wsm.lru_list) { + struct workspace *victim = container_of(pos, struct workspace, + lru_list); + unsigned int level; + + if (time_after(victim->last_used, reclaim_threshold)) + break; + + /* workspace is in use */ + if (victim->req_level) + continue; + + level = victim->level; + list_del(&victim->lru_list); + list_del(&victim->list); + zstd_free_workspace(&victim->list); + + if (list_empty(&wsm.idle_ws[level - 1])) + clear_bit(level - 1, &wsm.active_map); + + } + + if (!list_empty(&wsm.lru_list)) + mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + + spin_unlock(&wsm.lock); +} + +/* + * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds + * + * It is possible based on the level configurations that a higher level + * workspace uses less memory than a lower level workspace. In order to reuse + * workspaces, this must be made a monotonic relationship. This precomputes + * the required memory for each level and enforces the monotonicity between + * level and memory required. + */ +static void zstd_calc_ws_mem_sizes(void) +{ + size_t max_size = 0; + unsigned int level; + + for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + zstd_parameters params = + zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); + size_t level_size = + max_t(size_t, + zstd_cstream_workspace_bound(¶ms.cParams), + zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); + + max_size = max_t(size_t, max_size, level_size); + zstd_ws_mem_sizes[level - 1] = max_size; + } +} + +void zstd_init_workspace_manager(void) +{ + struct list_head *ws; + int i; + + zstd_calc_ws_mem_sizes(); + + wsm.ops = &btrfs_zstd_compress; + spin_lock_init(&wsm.lock); + init_waitqueue_head(&wsm.wait); + timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); + + INIT_LIST_HEAD(&wsm.lru_list); + for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&wsm.idle_ws[i]); + + ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + if (IS_ERR(ws)) { + pr_warn( + "BTRFS: cannot preallocate zstd compression workspace\n"); + } else { + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); + list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + } +} + +void zstd_cleanup_workspace_manager(void) +{ + struct workspace *workspace; + int i; + + spin_lock_bh(&wsm.lock); + for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&wsm.idle_ws[i])) { + workspace = container_of(wsm.idle_ws[i].next, + struct workspace, list); + list_del(&workspace->list); + list_del(&workspace->lru_list); + zstd_free_workspace(&workspace->list); + } + } + spin_unlock_bh(&wsm.lock); + + del_timer_sync(&wsm.timer); +} + +/* + * zstd_find_workspace - find workspace + * @level: compression level + * + * This iterates over the set bits in the active_map beginning at the requested + * compression level. This lets us utilize already allocated workspaces before + * allocating a new one. If the workspace is of a larger size, it is used, but + * the place in the lru_list and last_used times are not updated. This is to + * offer the opportunity to reclaim the workspace in favor of allocating an + * appropriately sized one in the future. 
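zstd_calc_ws_mem_sizes() above makes the per-level workspace sizes non-decreasing by carrying a running maximum; that monotonicity is what later allows a level-N workspace to serve any request for level <= N. The same idea in isolation (the input sizes are placeholders, not real zstd bounds):

#include <stddef.h>

#define MAX_LEVEL 15

/* ws_size[i] becomes max(raw_size[0..i]), i.e. monotonic in the level. */
static void calc_monotonic_sizes(const size_t raw_size[MAX_LEVEL],
				 size_t ws_size[MAX_LEVEL])
{
	size_t max_size = 0;

	for (int i = 0; i < MAX_LEVEL; i++) {
		if (raw_size[i] > max_size)
			max_size = raw_size[i];
		ws_size[i] = max_size;
	}
}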
+ */ +static struct list_head *zstd_find_workspace(unsigned int level) +{ + struct list_head *ws; + struct workspace *workspace; + int i = level - 1; + + spin_lock_bh(&wsm.lock); + for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&wsm.idle_ws[i])) { + ws = wsm.idle_ws[i].next; + workspace = list_to_workspace(ws); + list_del_init(ws); + /* keep its place if it's a lower level using this */ + workspace->req_level = level; + if (level == workspace->level) + list_del(&workspace->lru_list); + if (list_empty(&wsm.idle_ws[i])) + clear_bit(i, &wsm.active_map); + spin_unlock_bh(&wsm.lock); + return ws; + } + } + spin_unlock_bh(&wsm.lock); + + return NULL; +} + +/* + * zstd_get_workspace - zstd's get_workspace + * @level: compression level + * + * If @level is 0, then any compression level can be used. Therefore, we begin + * scanning from 1. We first scan through possible workspaces and then after + * attempt to allocate a new workspace. If we fail to allocate one due to + * memory pressure, go to sleep waiting for the max level workspace to free up. + */ +struct list_head *zstd_get_workspace(unsigned int level) +{ + struct list_head *ws; + unsigned int nofs_flag; + + /* level == 0 means we can use any workspace */ + if (!level) + level = 1; + +again: + ws = zstd_find_workspace(level); + if (ws) + return ws; + + nofs_flag = memalloc_nofs_save(); + ws = zstd_alloc_workspace(level); + memalloc_nofs_restore(nofs_flag); + + if (IS_ERR(ws)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + schedule(); + finish_wait(&wsm.wait, &wait); + + goto again; + } + + return ws; +} + +/* + * zstd_put_workspace - zstd put_workspace + * @ws: list_head for the workspace + * + * When putting back a workspace, we only need to update the LRU if we are of + * the requested compression level. Here is where we continue to protect the + * max level workspace or update last_used accordingly. If the reclaim timer + * isn't set, it is also set here. Only the max level workspace tries and wakes + * up waiting workspaces. 
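/* ---- Editor's illustration (not part of the kernel patch) ----------------
 * The put path documented above always returns the workspace to its level's
 * idle list, but only re-timestamps it and exposes it to the reclaim LRU
 * when it was used at its own level and a max level workspace is already
 * sitting idle; otherwise it stays off the LRU so reclaim cannot take away
 * the workspace that guarantees forward progress. The helper below models
 * only that decision; expose_to_reclaim() is invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

/* Should a returned workspace be placed on the reclaim LRU? */
static bool expose_to_reclaim(int level, int req_level, bool max_level_idle)
{
	if (req_level != level)		/* borrowed by a lower level: leave LRU alone */
		return false;
	return max_level_idle;		/* hide it if no max level workspace is idle */
}

int main(void)
{
	printf("%d\n", expose_to_reclaim(15, 15, false));	/* 0: protected */
	printf("%d\n", expose_to_reclaim(15, 15, true));	/* 1: goes on the LRU */
	printf("%d\n", expose_to_reclaim(15, 3, true));		/* 0: used by level 3 */
	return 0;
}
/* ---- end of editor's illustration; the patch resumes below --------------- */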
+ */ +void zstd_put_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_to_workspace(ws); + + spin_lock_bh(&wsm.lock); + + /* A node is only taken off the lru if we are the corresponding level */ + if (workspace->req_level == workspace->level) { + /* Hide a max level workspace from reclaim */ + if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + INIT_LIST_HEAD(&workspace->lru_list); + } else { + workspace->last_used = jiffies; + list_add(&workspace->lru_list, &wsm.lru_list); + if (!timer_pending(&wsm.timer)) + mod_timer(&wsm.timer, + jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + } + } + + set_bit(workspace->level - 1, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + workspace->req_level = 0; + + spin_unlock_bh(&wsm.lock); + + if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + cond_wake_up(&wsm.wait); +} + +void zstd_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->mem); + kfree(workspace->buf); + kfree(workspace); +} + +struct list_head *zstd_alloc_workspace(unsigned int level) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->size = zstd_ws_mem_sizes[level - 1]; + workspace->level = level; + workspace->req_level = level; + workspace->last_used = jiffies; + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!workspace->mem || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + INIT_LIST_HEAD(&workspace->lru_list); + + return &workspace->list; +fail: + zstd_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + zstd_cstream *stream; + int ret = 0; + int nr_pages = 0; + struct page *in_page = NULL; /* The current page to read */ + struct page *out_page = NULL; /* The current page to write to */ + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long len = *total_out; + const unsigned long nr_dest_pages = *out_pages; + unsigned long max_out = nr_dest_pages * PAGE_SIZE; + zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, + len); + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + /* Initialize the stream */ + stream = zstd_init_cstream(&params, len, workspace->mem, + workspace->size); + if (!stream) { + pr_warn("BTRFS: zstd_init_cstream failed\n"); + ret = -EIO; + goto out; + } + + /* map in the first page of input data */ + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + workspace->in_buf.src = kmap_local_page(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + + + /* Allocate and map in the output buffer */ + out_page = alloc_page(GFP_NOFS); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + + while (1) { + size_t ret2; + + ret2 = zstd_compress_stream(stream, &workspace->out_buf, + &workspace->in_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_compress_stream returned %d\n", +
zstd_get_error_code(ret2)); + ret = -EIO; + goto out; + } + + /* Check to see if we are making it bigger */ + if (tot_in + workspace->in_buf.pos > 8192 && + tot_in + workspace->in_buf.pos < + tot_out + workspace->out_buf.pos) { + ret = -E2BIG; + goto out; + } + + /* We've reached the end of our output range */ + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; + ret = -E2BIG; + goto out; + } + + /* Check if we need more output space */ + if (workspace->out_buf.pos == workspace->out_buf.size) { + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + if (nr_pages == nr_dest_pages) { + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, + PAGE_SIZE); + } + + /* We've reached the end of the input */ + if (workspace->in_buf.pos >= len) { + tot_in += workspace->in_buf.pos; + break; + } + + /* Check if we need more input */ + if (workspace->in_buf.pos == workspace->in_buf.size) { + tot_in += PAGE_SIZE; + kunmap_local(workspace->in_buf.src); + put_page(in_page); + start += PAGE_SIZE; + len -= PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + workspace->in_buf.src = kmap_local_page(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + } + } + while (1) { + size_t ret2; + + ret2 = zstd_end_stream(stream, &workspace->out_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_end_stream returned %d\n", + zstd_get_error_code(ret2)); + ret = -EIO; + goto out; + } + if (ret2 == 0) { + tot_out += workspace->out_buf.pos; + break; + } + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; + ret = -E2BIG; + goto out; + } + + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + if (nr_pages == nr_dest_pages) { + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + if (tot_out >= tot_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; + *total_in = tot_in; + *total_out = tot_out; +out: + *out_pages = nr_pages; + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); + put_page(in_page); + } + return ret; +} + +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct page **pages_in = cb->compressed_pages; + size_t srclen = cb->compressed_len; + zstd_dstream *stream; + int ret = 0; + unsigned long page_in_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long buf_start; + unsigned long total_out = 0; + + stream = zstd_init_dstream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_debug("BTRFS: zstd_init_dstream failed\n"); + ret = -EIO; + goto done; + } + + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; + + while (1) { + size_t ret2; + + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, + &workspace->in_buf); + if 
(zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); + ret = -EIO; + goto done; + } + buf_start = total_out; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; + + ret = btrfs_decompress_buf2page(workspace->out_buf.dst, + total_out - buf_start, cb, buf_start); + if (ret == 0) + break; + + if (workspace->in_buf.pos >= srclen) + break; + + /* Check if we've hit the end of a frame */ + if (ret2 == 0) + break; + + if (workspace->in_buf.pos == workspace->in_buf.size) { + kunmap_local(workspace->in_buf.src); + page_in_index++; + if (page_in_index >= total_pages_in) { + workspace->in_buf.src = NULL; + ret = -EIO; + goto done; + } + srclen -= PAGE_SIZE; + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + } + } + ret = 0; +done: + if (workspace->in_buf.src) + kunmap_local(workspace->in_buf.src); + return ret; +} + +int zstd_decompress(struct list_head *ws, const u8 *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + zstd_dstream *stream; + int ret = 0; + size_t ret2; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + + stream = zstd_init_dstream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_warn("BTRFS: zstd_init_dstream failed\n"); + ret = -EIO; + goto finish; + } + + destlen = min_t(size_t, destlen, PAGE_SIZE); + + workspace->in_buf.src = data_in; + workspace->in_buf.pos = 0; + workspace->in_buf.size = srclen; + + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen + && workspace->in_buf.pos < workspace->in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: zstd_decompress_stream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, + &workspace->in_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = total_out; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + workspace->out_buf.size - buf_offset); + + memcpy_to_page(dest_page, pg_offset, + workspace->out_buf.dst + buf_offset, bytes); + + pg_offset += bytes; + } + ret = 0; +finish: + if (pg_offset < destlen) { + memzero_page(dest_page, pg_offset, destlen - pg_offset); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zstd_compress = { + /* ZSTD uses own workspace manager */ + .workspace_manager = NULL, + .max_level = ZSTD_BTRFS_MAX_LEVEL, + .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, +}; -- cgit v1.2.3
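Editor's note: zstd_decompress() above decompresses whole chunks into a bounce buffer and copies out only the window [start_byte, start_byte + destlen), zero-filling whatever the stream could not supply. The stand-alone sketch below models just that buf_start/buf_offset/bytes bookkeeping with a fake "decompressor" that hands back fixed-size chunks of an already known stream, simplified to copy from the bytes actually produced per chunk; CHUNK, copy_window() and every other name here are invented for the sketch and are not btrfs or zstd APIs.

#include <stdio.h>
#include <string.h>

#define CHUNK 8	/* size of each chunk the fake "stream" produces */

/*
 * Copy the window [start_byte, start_byte + destlen) of a logical output
 * stream into dest, where the stream arrives CHUNK bytes at a time in buf.
 */
static void copy_window(const char *stream, size_t stream_len,
			char *dest, size_t destlen, size_t start_byte)
{
	char buf[CHUNK];
	size_t total_out = 0, pg_offset = 0;

	while (pg_offset < destlen && total_out < stream_len) {
		size_t out = stream_len - total_out < CHUNK ?
			     stream_len - total_out : CHUNK;
		size_t buf_start = total_out;
		size_t buf_offset, bytes;

		memcpy(buf, stream + total_out, out);	/* "decompress" a chunk */
		total_out += out;

		if (total_out <= start_byte)		/* window not reached yet */
			continue;

		buf_offset = buf_start < start_byte ? start_byte - buf_start : 0;
		bytes = destlen - pg_offset < out - buf_offset ?
			destlen - pg_offset : out - buf_offset;
		memcpy(dest + pg_offset, buf + buf_offset, bytes);
		pg_offset += bytes;
	}
	memset(dest + pg_offset, 0, destlen - pg_offset);	/* like memzero_page() */
}

int main(void)
{
	char stream[26], dest[10];
	size_t i;

	for (i = 0; i < sizeof(stream); i++)
		stream[i] = (char)('a' + i);
	copy_window(stream, sizeof(stream), dest, sizeof(dest), 13);
	printf("%.10s\n", dest);	/* prints "nopqrstuvw" */
	return 0;
}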